From a24595f2093c42d585a654cdfb261d82fc4c5645 Mon Sep 17 00:00:00 2001 From: Bastian Germann Date: Mon, 9 Feb 2026 12:01:04 +0100 Subject: [PATCH] Import python-pcre2_0.6.0+ds.orig.tar.xz [dgit import orig python-pcre2_0.6.0+ds.orig.tar.xz] --- CMakeLists.txt | 29 + LICENSE | 29 + Makefile | 27 + PKG-INFO | 150 ++ README.md | 116 ++ pyproject.toml | 8 + requirements/build-requirements.txt | 6 + requirements/test-requirements.txt | 3 + setup.cfg | 4 + setup.py | 48 + src/pcre2.egg-info/PKG-INFO | 150 ++ src/pcre2.egg-info/SOURCES.txt | 642 ++++++++ src/pcre2.egg-info/dependency_links.txt | 1 + src/pcre2.egg-info/top_level.txt | 1 + src/pcre2/CMakeLists.txt | 34 + src/pcre2/__init__.py | 497 ++++++ src/pcre2/_cy.pyx | 590 +++++++ src/pcre2/_libpcre2.pxd | 500 ++++++ tests/test_groups.py | 14 + tests/test_match.py | 58 + tests/test_pattern.py | 237 +++ tests/test_re_compatibility.py | 1918 +++++++++++++++++++++++ 22 files changed, 5062 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 PKG-INFO create mode 100755 README.md create mode 100755 pyproject.toml create mode 100644 requirements/build-requirements.txt create mode 100644 requirements/test-requirements.txt create mode 100644 setup.cfg create mode 100755 setup.py create mode 100644 src/pcre2.egg-info/PKG-INFO create mode 100644 src/pcre2.egg-info/SOURCES.txt create mode 100644 src/pcre2.egg-info/dependency_links.txt create mode 100644 src/pcre2.egg-info/top_level.txt create mode 100644 src/pcre2/CMakeLists.txt create mode 100755 src/pcre2/__init__.py create mode 100644 src/pcre2/_cy.pyx create mode 100755 src/pcre2/_libpcre2.pxd create mode 100644 tests/test_groups.py create mode 100644 tests/test_match.py create mode 100644 tests/test_pattern.py create mode 100644 tests/test_re_compatibility.py diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..2dddcfb --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,29 @@ 
+cmake_minimum_required(VERSION 3.7.2) + +project(pcre2) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +set(CMAKE_C_STANDARD 99) + +set(PCRE2_INCLUDE_DIR ${CMAKE_BINARY_DIR}/src/libpcre2/interface) +set(CYTHON_EXTRA_COMPILE_ARGS -DPCRE2_CODE_UNIT_WIDTH=8 -fPIC) + +# Set PCRE2 options. +set(PCRE2_SUPPORT_JIT ON CACHE BOOL "" FORCE) +set(PCRE2_NEVER_BACKSLASH_C ON CACHE BOOL "" FORCE) + +# Always make a release build. +set(CMAKE_BUILD_TYPE Release) + +# Build PCRE2 library as both shared and static. +set(BUILD_STATIC_LIBS ON) +set(BUILD_SHARED_LIBS ON) +add_subdirectory(src/libpcre2) + +# Build Cython code as shared. +set(BUILD_STATIC_LIBS OFF) +set(BUILD_SHARED_LIBS ON) +add_subdirectory(src/pcre2) + +# Include PCRE2 header for Cython API. +install(FILES ${PCRE2_INCLUDE_DIR}/pcre2.h DESTINATION src/pcre2) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4a57011 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, grtetrault +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8755fcc --- /dev/null +++ b/Makefile @@ -0,0 +1,27 @@ +SHELL = /bin/bash + +init: + git submodule update --init --recursive + python3 -m venv ./.venv + ./.venv/bin/pip install -r ./requirements/build-requirements.txt + ./.venv/bin/pip install -r ./requirements/test-requirements.txt + ./.venv/bin/pip install . + +build: + ./.venv/bin/pip install . --force-reinstall + +clean: + rm -rf ./dist + rm -rf ./build + rm -rf ./_skbuild + find ./src/pcre2 -type f -name '*.c' -print0 | xargs -0 rm -vf + find ./src/pcre2 -type f -name '*.html' -print0 | xargs -0 rm -vf + find . -type f -name '*.pyc' | xargs rm -r + find . -type d -name '*.egg-info' | xargs rm -r + find . 
-type d -name '*.ipynb_checkpoints' | xargs rm -r + +purge: + rm -rf ./.venv + +benchmark: + ./.venv/bin/python ./benchmarks/run_regex_redux.py diff --git a/PKG-INFO b/PKG-INFO new file mode 100644 index 0000000..b186c2a --- /dev/null +++ b/PKG-INFO @@ -0,0 +1,150 @@ +Metadata-Version: 2.4 +Name: pcre2 +Version: 0.6.0 +Summary: Python bindings for the PCRE2 regular expression library +Home-page: https://github.com/grtetrault/pcre2.py +Author: Garrett Tetrault +License: BSD 3-Clause License +Classifier: Development Status :: 4 - Beta +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Programming Language :: C +Classifier: Programming Language :: Cython +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming Language :: Python :: 3.13 +Classifier: Programming Language :: Python :: 3.14 +Classifier: Operating System :: MacOS :: MacOS X +Classifier: Operating System :: POSIX :: Linux +Classifier: Operating System :: Microsoft :: Windows +Description-Content-Type: text/markdown +License-File: LICENSE +Dynamic: author +Dynamic: classifier +Dynamic: description +Dynamic: description-content-type +Dynamic: home-page +Dynamic: license +Dynamic: license-file +Dynamic: summary + +# PCRE2.py: Python bindings for the PCRE2 regular expression library + +This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2). +PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel. +For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2). 
+ +## Installation + +From PyPI: +``` +pip install pcre2 +``` + +If a wheel is not available for your platform, the module will be built from source. +Building requires: + +* `cmake` +* C compiler toolchain, such as `gcc` and `make` +* `libtool` +* Python headers + +## Usage + +This library aims to be compatible with Python's built-in `re` module. In many cases, this means +that `pcre2` can drop-in replace `re` to gain some performance (see benchmarks below). +However, PCRE2 and Python implement different regex specifications, so patterns and behavior will +not always be translatable (e.g., the syntax for group replacement differs). + +Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and +bytes-like objects. +This returns a `Pattern` object. +Expressions can be compiled with a number of options (combined with the bitwise-or operator) and +can be JIT compiled, + +```python +>>> import pcre2 +>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)' +>>> patn = pcre2.compile(expr, flags=pcre2.I, jit=True) +>>> # Patterns can also be JIT compiled after initialization. +>>> patn.jit_compile() +``` + +Inspection of `Pattern` objects is done as follows, + +```python +>>> patn.jit +True +>>> patn.groupindex +{'head': 1, 'tail': 2} +>>> patn.flags +<CompileOption.CASELESS: 8> +``` + +Once compiled, `Pattern` objects can be used to match against strings. +Matching returns a `Match` object, which has several functions to view results, + +```python +>>> subj = 'foo bar buzz bazz' +>>> match = patn.match(subj) +>>> match[0] +'foo bar' +>>> match.span() +(0, 7) +``` + +Substitution is also supported, both from `Pattern` and `Match` objects, + +```python +>>> repl = '$2 $1' +>>> patn.sub(repl, subj) # Global substitutions by default. 
+'bar foo bazz buzz' +>>> patn.sub(repl, subj, count=1) +'bar foo buzz bazz' +>>> match.expand(repl) +'bar foo' +``` + +Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches, + +```python +>>> for match in patn.finditer(subj): +... print(match.group('head')) +... +foo +buzz +``` + +## Performance + +PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled. +Below are the `regex-redux` benchmark results included in this repository, + +| Script | Number of runs | Total time | Real time | User time | System time | +| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- | +| baseline.py | 10 | 3.230 | 0.323 | 0.020 | 0.100 | +| re_vanilla.py | 10 | 51.090 | 5.109 | 11.375 | 0.530 | +| pcre2_vanilla.py | 10 | 21.980 | 2.198 | 3.154 | 0.483 | +| pcre2_optimized.py | 10 | 14.860 | 1.486 | 2.520 | 0.548 | +| cffi_optimized.py | 10 | 14.130 | 1.413 | 3.111 | 0.411 | + +Script descriptions are as follows, + +| Script | Description | +| ------------------- | -------------------------------------------------------------------- | +| `baseline.py` | Reads input file and outputs stored expected output | +| `re_vanilla.py` | Pure Python version | +| `pcre2_vanilla.py` | Same as `re_vanilla.py`, with `pcre2` drop-in replacing `re` | +| `pcre2_optimized.py` | More optimized implementation using `pcre2` | +| `cffi_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library | + +Tests were performed on an M2 MacBook Air. +Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset. +Additionally, a Python virtual environment must be created, and the package built +with `make init` and `make build` respectively. +For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html). 
+See source code of benchmark scripts for details and original sources. diff --git a/README.md b/README.md new file mode 100755 index 0000000..9cb6b16 --- /dev/null +++ b/README.md @@ -0,0 +1,116 @@ +# PCRE2.py: Python bindings for the PCRE2 regular expression library + +This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2). +PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel. +For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2). + +## Installation + +From PyPI: +``` +pip install pcre2 +``` + +If a wheel is not available for your platform, the module will be built from source. +Building requires: + +* `cmake` +* C compiler toolchain, such as `gcc` and `make` +* `libtool` +* Python headers + +## Usage + +This library aims to be compatible with Python's built-in `re` module. In many cases, this means +that `pcre2` can drop-in replace `re` to gain some performance (see benchmarks below). +However, PCRE2 and Python implement different regex specifications, so patterns and behavior will +not always be translatable (e.g., the syntax for group replacement differs). + +Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and +bytes-like objects. +This returns a `Pattern` object. +Expressions can be compiled with a number of options (combined with the bitwise-or operator) and +can be JIT compiled, + +```python +>>> import pcre2 +>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)' +>>> patn = pcre2.compile(expr, flags=pcre2.I, jit=True) +>>> # Patterns can also be JIT compiled after initialization. +>>> patn.jit_compile() +``` + +Inspection of `Pattern` objects is done as follows, + +```python +>>> patn.jit +True +>>> patn.groupindex +{'head': 1, 'tail': 2} +>>> patn.flags +<CompileOption.CASELESS: 8> +``` + +Once compiled, `Pattern` objects can be used to match against strings. 
+Matching returns a `Match` object, which has several functions to view results, + +```python +>>> subj = 'foo bar buzz bazz' +>>> match = patn.match(subj) +>>> match[0] +'foo bar' +>>> match.span() +(0, 7) +``` + +Substitution is also supported, both from `Pattern` and `Match` objects, + +```python +>>> repl = '$2 $1' +>>> patn.sub(repl, subj) # Global substitutions by default. +'bar foo bazz buzz' +>>> patn.sub(repl, subj, count=1) +'bar foo buzz bazz' +>>> match.expand(repl) +'bar foo' +``` + +Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches, + +```python +>>> for match in patn.finditer(subj): +... print(match.group('head')) +... +foo +buzz +``` + +## Performance + +PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled. +Below are the `regex-redux` benchmark results included in this repository, + +| Script | Number of runs | Total time | Real time | User time | System time | +| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- | +| baseline.py | 10 | 3.230 | 0.323 | 0.020 | 0.100 | +| re_vanilla.py | 10 | 51.090 | 5.109 | 11.375 | 0.530 | +| pcre2_vanilla.py | 10 | 21.980 | 2.198 | 3.154 | 0.483 | +| pcre2_optimized.py | 10 | 14.860 | 1.486 | 2.520 | 0.548 | +| cffi_optimized.py | 10 | 14.130 | 1.413 | 3.111 | 0.411 | + +Script descriptions are as follows, + +| Script | Description | +| ------------------- | -------------------------------------------------------------------- | +| `baseline.py` | Reads input file and outputs stored expected output | +| `re_vanilla.py` | Pure Python version | +| `pcre2_vanilla.py` | Same as `re_vanilla.py`, with `pcre2` drop-in replacing `re` | +| `pcre2_optimized.py` | More optimized implementation using `pcre2` | +| `cffi_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library | + +Tests were performed on an M2 MacBook Air. 
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset. +Additionally, a Python virtual environment must be created, and the package built +with `make init` and `make build` respectively. +For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html). +See source code of benchmark scripts for details and original sources. diff --git a/pyproject.toml b/pyproject.toml new file mode 100755 index 0000000..c0f420a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,8 @@ +[build-system] +requires = [ + "setuptools>=42", + "scikit-build", + "Cython", + "cmake" +] +build-backend = "setuptools.build_meta" diff --git a/requirements/build-requirements.txt b/requirements/build-requirements.txt new file mode 100644 index 0000000..067a22d --- /dev/null +++ b/requirements/build-requirements.txt @@ -0,0 +1,6 @@ +requests +build +wheel +scikit-build +cmake +Cython \ No newline at end of file diff --git a/requirements/test-requirements.txt b/requirements/test-requirements.txt new file mode 100644 index 0000000..209b771 --- /dev/null +++ b/requirements/test-requirements.txt @@ -0,0 +1,3 @@ +twine +pytest +gitpython \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..8bfd5a1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ +[egg_info] +tag_build = +tag_date = 0 + diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..ddd7ba2 --- /dev/null +++ b/setup.py @@ -0,0 +1,48 @@ +# -*- coding:utf-8 -*- + +import os +import skbuild +import setuptools + + +def get_long_desciption(): + cwd = os.path.abspath(os.path.dirname(__file__)) + filename = os.path.join(cwd, "README.md") + with open(filename) as f: + long_description = f.read() + + return long_description + + +skbuild.setup( + name="pcre2", + version="0.6.0", + description="Python bindings for the PCRE2 regular 
expression library", + long_description=get_long_desciption(), + long_description_content_type="text/markdown", + license="BSD 3-Clause License", + author="Garrett Tetrault", + url="https://github.com/grtetrault/pcre2.py", + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Programming Language :: C", + "Programming Language :: Cython", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + ], + include_package_data=True, + packages=setuptools.find_packages("src"), + package_dir={"": "src"}, + cmake_languages="C", +) diff --git a/src/pcre2.egg-info/PKG-INFO b/src/pcre2.egg-info/PKG-INFO new file mode 100644 index 0000000..b186c2a --- /dev/null +++ b/src/pcre2.egg-info/PKG-INFO @@ -0,0 +1,150 @@ +Metadata-Version: 2.4 +Name: pcre2 +Version: 0.6.0 +Summary: Python bindings for the PCRE2 regular expression library +Home-page: https://github.com/grtetrault/pcre2.py +Author: Garrett Tetrault +License: BSD 3-Clause License +Classifier: Development Status :: 4 - Beta +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Programming Language :: C +Classifier: Programming Language :: Cython +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming 
Language :: Python :: 3.13 +Classifier: Programming Language :: Python :: 3.14 +Classifier: Operating System :: MacOS :: MacOS X +Classifier: Operating System :: POSIX :: Linux +Classifier: Operating System :: Microsoft :: Windows +Description-Content-Type: text/markdown +License-File: LICENSE +Dynamic: author +Dynamic: classifier +Dynamic: description +Dynamic: description-content-type +Dynamic: home-page +Dynamic: license +Dynamic: license-file +Dynamic: summary + +# PCRE2.py: Python bindings for the PCRE2 regular expression library + +This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2). +PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel. +For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2). + +## Installation + +From PyPI: +``` +pip install pcre2 +``` + +If a wheel is not available for your platform, the module will be built from source. +Building requires: + +* `cmake` +* C compiler toolchain, such as `gcc` and `make` +* `libtool` +* Python headers + +## Usage + +This library aims to be compatible with Python's built-in `re` module. In many cases, this means +that `pcre2` can drop-in replace `re` to gain some performance (see benchmarks below). +However, PCRE2 and Python implement different regex specifications, so patterns and behavior will +not always be translatable (e.g., the syntax for group replacement differs). + +Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and +bytes-like objects. +This returns a `Pattern` object. +Expressions can be compiled with a number of options (combined with the bitwise-or operator) and +can be JIT compiled, + +```python +>>> import pcre2 +>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)' +>>> patn = pcre2.compile(expr, flags=pcre2.I, jit=True) +>>> # Patterns can also be JIT compiled after initialization. 
+>>> patn.jit_compile() +``` + +Inspection of `Pattern` objects is done as follows, + +```python +>>> patn.jit +True +>>> patn.groupindex +{'head': 1, 'tail': 2} +>>> patn.flags +<CompileOption.CASELESS: 8> +``` + +Once compiled, `Pattern` objects can be used to match against strings. +Matching returns a `Match` object, which has several functions to view results, + +```python +>>> subj = 'foo bar buzz bazz' +>>> match = patn.match(subj) +>>> match[0] +'foo bar' +>>> match.span() +(0, 7) +``` + +Substitution is also supported, both from `Pattern` and `Match` objects, + +```python +>>> repl = '$2 $1' +>>> patn.sub(repl, subj) # Global substitutions by default. +'bar foo bazz buzz' +>>> patn.sub(repl, subj, count=1) +'bar foo buzz bazz' +>>> match.expand(repl) +'bar foo' +``` + +Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches, + +```python +>>> for match in patn.finditer(subj): +... print(match.group('head')) +... +foo +buzz +``` + +## Performance + +PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled. 
+Below are the `regex-redux` benchmark results included in this repository, + +| Script | Number of runs | Total time | Real time | User time | System time | +| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- | +| baseline.py | 10 | 3.230 | 0.323 | 0.020 | 0.100 | +| re_vanilla.py | 10 | 51.090 | 5.109 | 11.375 | 0.530 | +| pcre2_vanilla.py | 10 | 21.980 | 2.198 | 3.154 | 0.483 | +| pcre2_optimized.py | 10 | 14.860 | 1.486 | 2.520 | 0.548 | +| cffi_optimized.py | 10 | 14.130 | 1.413 | 3.111 | 0.411 | + +Script descriptions are as follows, + +| Script | Description | +| ------------------- | -------------------------------------------------------------------- | +| `baseline.py` | Reads input file and outputs stored expected output | +| `re_vanilla.py` | Pure Python version | +| `pcre2_vanilla.py` | Same as `re_vanilla.py`, with `pcre2` drop-in replacing `re` | +| `pcre2_optimized.py` | More optimized implementation using `pcre2` | +| `cffi_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library | + +Tests were performed on an M2 MacBook Air. +Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset. +Additionally, a Python virtual environment must be created, and the package built +with `make init` and `make build` respectively. +For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html). +See source code of benchmark scripts for details and original sources. 
diff --git a/src/pcre2.egg-info/SOURCES.txt b/src/pcre2.egg-info/SOURCES.txt new file mode 100644 index 0000000..e06b222 --- /dev/null +++ b/src/pcre2.egg-info/SOURCES.txt @@ -0,0 +1,642 @@ +CMakeLists.txt +LICENSE +Makefile +README.md +pyproject.toml +setup.py +requirements/build-requirements.txt +requirements/test-requirements.txt +src/libpcre2/.editorconfig +src/libpcre2/.git +src/libpcre2/.gitattributes +src/libpcre2/.gitignore +src/libpcre2/.gitmodules +src/libpcre2/AUTHORS.md +src/libpcre2/BUILD.bazel +src/libpcre2/CMakeLists.txt +src/libpcre2/COPYING +src/libpcre2/ChangeLog +src/libpcre2/HACKING +src/libpcre2/INSTALL +src/libpcre2/LICENCE.md +src/libpcre2/MODULE.bazel +src/libpcre2/Makefile.am +src/libpcre2/Makefile.in +src/libpcre2/NEWS +src/libpcre2/NON-AUTOTOOLS-BUILD +src/libpcre2/README +src/libpcre2/README.md +src/libpcre2/RunGrepTest +src/libpcre2/RunGrepTest.bat +src/libpcre2/RunTest +src/libpcre2/RunTest.bat +src/libpcre2/SECURITY.md +src/libpcre2/aclocal.m4 +src/libpcre2/ar-lib +src/libpcre2/autogen.sh +src/libpcre2/build.zig +src/libpcre2/compile +src/libpcre2/config.guess +src/libpcre2/config.sub +src/libpcre2/configure +src/libpcre2/configure.ac +src/libpcre2/depcomp +src/libpcre2/install-sh +src/libpcre2/libpcre2-16.pc.in +src/libpcre2/libpcre2-32.pc.in +src/libpcre2/libpcre2-8.pc.in +src/libpcre2/libpcre2-posix.pc.in +src/libpcre2/ltmain.sh +src/libpcre2/missing +src/libpcre2/pcre2-config.in +src/libpcre2/perltest.sh +src/libpcre2/test-driver +src/libpcre2/.github/codecov.yml +src/libpcre2/.github/dependabot.yml +src/libpcre2/.github/scripts/merge_sarif.py +src/libpcre2/.github/workflows/build.yml +src/libpcre2/.github/workflows/cifuzz.yml +src/libpcre2/.github/workflows/clang-analyzer.yml +src/libpcre2/.github/workflows/codeql.yml +src/libpcre2/.github/workflows/dev.yml +src/libpcre2/.github/workflows/pages.yml +src/libpcre2/.github/workflows/scorecards.yml +src/libpcre2/.github/workflows/sync.yml +src/libpcre2/cmake/COPYING-CMAKE-SCRIPTS 
+src/libpcre2/cmake/FindEditline.cmake +src/libpcre2/cmake/FindReadline.cmake +src/libpcre2/cmake/PCRE2CheckVscript.cmake +src/libpcre2/cmake/PCRE2UseSystemExtensions.cmake +src/libpcre2/cmake/PCRE2WarningAsError.cmake +src/libpcre2/cmake/pcre2-config.cmake.in +src/libpcre2/deps/sljit/.git +src/libpcre2/deps/sljit/.gitignore +src/libpcre2/deps/sljit/API_CHANGES +src/libpcre2/deps/sljit/CMakeLists.txt +src/libpcre2/deps/sljit/GNUmakefile +src/libpcre2/deps/sljit/INTERNAL_CHANGES +src/libpcre2/deps/sljit/LICENSE +src/libpcre2/deps/sljit/README.md +src/libpcre2/deps/sljit/.github/workflows/actions.yml +src/libpcre2/deps/sljit/docs/README.md +src/libpcre2/deps/sljit/docs/general/architecture.md +src/libpcre2/deps/sljit/docs/general/contributing.md +src/libpcre2/deps/sljit/docs/general/introduction.md +src/libpcre2/deps/sljit/docs/general/getting-started/_category_.json +src/libpcre2/deps/sljit/docs/general/getting-started/configuration.md +src/libpcre2/deps/sljit/docs/general/getting-started/setup.md +src/libpcre2/deps/sljit/docs/general/use-cases/_category_.json +src/libpcre2/deps/sljit/docs/general/use-cases/bytecode-interpreters.md +src/libpcre2/deps/sljit/docs/general/use-cases/overview.md +src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/_category_.json +src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/figure1.svg +src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/performance-comparison.md +src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/regular-expression-engine-types.md +src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/speeding-up-pcre2-with-sljit.md +src/libpcre2/deps/sljit/docs/tutorial/01-overview.md +src/libpcre2/deps/sljit/docs/tutorial/02-your-first-program.md +src/libpcre2/deps/sljit/docs/tutorial/03-branching.md +src/libpcre2/deps/sljit/docs/tutorial/04-calling-external-functions.md +src/libpcre2/deps/sljit/docs/tutorial/05-accessing-structures.md 
+src/libpcre2/deps/sljit/docs/tutorial/06-accessing-arrays.md +src/libpcre2/deps/sljit/docs/tutorial/07-local-variables.md +src/libpcre2/deps/sljit/docs/tutorial/08-where-to-go-from-here.md +src/libpcre2/deps/sljit/docs/tutorial/sources/99bottles.bf +src/libpcre2/deps/sljit/docs/tutorial/sources/array_access.c +src/libpcre2/deps/sljit/docs/tutorial/sources/brainfuck.c +src/libpcre2/deps/sljit/docs/tutorial/sources/branch.c +src/libpcre2/deps/sljit/docs/tutorial/sources/first_program.c +src/libpcre2/deps/sljit/docs/tutorial/sources/func_call.c +src/libpcre2/deps/sljit/docs/tutorial/sources/hello.bf +src/libpcre2/deps/sljit/docs/tutorial/sources/loop.c +src/libpcre2/deps/sljit/docs/tutorial/sources/struct_access.c +src/libpcre2/deps/sljit/docs/tutorial/sources/temp_var.c +src/libpcre2/deps/sljit/docs/website/.gitignore +src/libpcre2/deps/sljit/docs/website/README.md +src/libpcre2/deps/sljit/docs/website/docusaurus.config.js +src/libpcre2/deps/sljit/docs/website/package-lock.json +src/libpcre2/deps/sljit/docs/website/package.json +src/libpcre2/deps/sljit/docs/website/sidebars.js +src/libpcre2/deps/sljit/docs/website/src/components/HomepageFeatures/index.js +src/libpcre2/deps/sljit/docs/website/src/components/HomepageFeatures/styles.module.css +src/libpcre2/deps/sljit/docs/website/src/css/custom.css +src/libpcre2/deps/sljit/docs/website/src/pages/index.js +src/libpcre2/deps/sljit/docs/website/src/pages/index.module.css +src/libpcre2/deps/sljit/docs/website/static/.nojekyll +src/libpcre2/deps/sljit/docs/website/static/assets/regex-test.tgz +src/libpcre2/deps/sljit/regex_src/regexJIT.c +src/libpcre2/deps/sljit/regex_src/regexJIT.h +src/libpcre2/deps/sljit/regex_src/regexMain.c +src/libpcre2/deps/sljit/sljit_src/sljitConfig.h +src/libpcre2/deps/sljit/sljit_src/sljitConfigCPU.h +src/libpcre2/deps/sljit/sljit_src/sljitConfigInternal.h +src/libpcre2/deps/sljit/sljit_src/sljitLir.c +src/libpcre2/deps/sljit/sljit_src/sljitLir.h 
+src/libpcre2/deps/sljit/sljit_src/sljitNativeARM_32.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeARM_64.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeARM_T2_32.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeLOONGARCH_64.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeMIPS_32.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeMIPS_64.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeMIPS_common.c +src/libpcre2/deps/sljit/sljit_src/sljitNativePPC_32.c +src/libpcre2/deps/sljit/sljit_src/sljitNativePPC_64.c +src/libpcre2/deps/sljit/sljit_src/sljitNativePPC_common.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeRISCV_32.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeRISCV_64.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeRISCV_common.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeS390X.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeX86_32.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeX86_64.c +src/libpcre2/deps/sljit/sljit_src/sljitNativeX86_common.c +src/libpcre2/deps/sljit/sljit_src/sljitSerialize.c +src/libpcre2/deps/sljit/sljit_src/sljitUtils.c +src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorApple.c +src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorCore.c +src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorFreeBSD.c +src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorPosix.c +src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorWindows.c +src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitProtExecAllocatorNetBSD.c +src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitProtExecAllocatorPosix.c +src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitWXExecAllocatorPosix.c +src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitWXExecAllocatorWindows.c +src/libpcre2/deps/sljit/test_src/sljitConfigPost.h +src/libpcre2/deps/sljit/test_src/sljitConfigPre.h +src/libpcre2/deps/sljit/test_src/sljitMain.c +src/libpcre2/deps/sljit/test_src/sljitTest.c 
+src/libpcre2/deps/sljit/test_src/sljitTestBuffers.h +src/libpcre2/deps/sljit/test_src/sljitTestCall.h +src/libpcre2/deps/sljit/test_src/sljitTestFloat.h +src/libpcre2/deps/sljit/test_src/sljitTestSerialize.h +src/libpcre2/deps/sljit/test_src/sljitTestSimd.h +src/libpcre2/doc/index.html.src +src/libpcre2/doc/pcre2-config.1 +src/libpcre2/doc/pcre2-config.txt +src/libpcre2/doc/pcre2.3 +src/libpcre2/doc/pcre2.txt +src/libpcre2/doc/pcre2_callout_enumerate.3 +src/libpcre2/doc/pcre2_code_copy.3 +src/libpcre2/doc/pcre2_code_copy_with_tables.3 +src/libpcre2/doc/pcre2_code_free.3 +src/libpcre2/doc/pcre2_compile.3 +src/libpcre2/doc/pcre2_compile_context_copy.3 +src/libpcre2/doc/pcre2_compile_context_create.3 +src/libpcre2/doc/pcre2_compile_context_free.3 +src/libpcre2/doc/pcre2_config.3 +src/libpcre2/doc/pcre2_convert_context_copy.3 +src/libpcre2/doc/pcre2_convert_context_create.3 +src/libpcre2/doc/pcre2_convert_context_free.3 +src/libpcre2/doc/pcre2_converted_pattern_free.3 +src/libpcre2/doc/pcre2_dfa_match.3 +src/libpcre2/doc/pcre2_general_context_copy.3 +src/libpcre2/doc/pcre2_general_context_create.3 +src/libpcre2/doc/pcre2_general_context_free.3 +src/libpcre2/doc/pcre2_get_error_message.3 +src/libpcre2/doc/pcre2_get_mark.3 +src/libpcre2/doc/pcre2_get_match_data_heapframes_size.3 +src/libpcre2/doc/pcre2_get_match_data_size.3 +src/libpcre2/doc/pcre2_get_ovector_count.3 +src/libpcre2/doc/pcre2_get_ovector_pointer.3 +src/libpcre2/doc/pcre2_get_startchar.3 +src/libpcre2/doc/pcre2_jit_compile.3 +src/libpcre2/doc/pcre2_jit_free_unused_memory.3 +src/libpcre2/doc/pcre2_jit_match.3 +src/libpcre2/doc/pcre2_jit_stack_assign.3 +src/libpcre2/doc/pcre2_jit_stack_create.3 +src/libpcre2/doc/pcre2_jit_stack_free.3 +src/libpcre2/doc/pcre2_maketables.3 +src/libpcre2/doc/pcre2_maketables_free.3 +src/libpcre2/doc/pcre2_match.3 +src/libpcre2/doc/pcre2_match_context_copy.3 +src/libpcre2/doc/pcre2_match_context_create.3 +src/libpcre2/doc/pcre2_match_context_free.3 
+src/libpcre2/doc/pcre2_match_data_create.3 +src/libpcre2/doc/pcre2_match_data_create_from_pattern.3 +src/libpcre2/doc/pcre2_match_data_free.3 +src/libpcre2/doc/pcre2_next_match.3 +src/libpcre2/doc/pcre2_pattern_convert.3 +src/libpcre2/doc/pcre2_pattern_info.3 +src/libpcre2/doc/pcre2_serialize_decode.3 +src/libpcre2/doc/pcre2_serialize_encode.3 +src/libpcre2/doc/pcre2_serialize_free.3 +src/libpcre2/doc/pcre2_serialize_get_number_of_codes.3 +src/libpcre2/doc/pcre2_set_bsr.3 +src/libpcre2/doc/pcre2_set_callout.3 +src/libpcre2/doc/pcre2_set_character_tables.3 +src/libpcre2/doc/pcre2_set_compile_extra_options.3 +src/libpcre2/doc/pcre2_set_compile_recursion_guard.3 +src/libpcre2/doc/pcre2_set_depth_limit.3 +src/libpcre2/doc/pcre2_set_glob_escape.3 +src/libpcre2/doc/pcre2_set_glob_separator.3 +src/libpcre2/doc/pcre2_set_heap_limit.3 +src/libpcre2/doc/pcre2_set_match_limit.3 +src/libpcre2/doc/pcre2_set_max_pattern_compiled_length.3 +src/libpcre2/doc/pcre2_set_max_pattern_length.3 +src/libpcre2/doc/pcre2_set_max_varlookbehind.3 +src/libpcre2/doc/pcre2_set_newline.3 +src/libpcre2/doc/pcre2_set_offset_limit.3 +src/libpcre2/doc/pcre2_set_optimize.3 +src/libpcre2/doc/pcre2_set_parens_nest_limit.3 +src/libpcre2/doc/pcre2_set_recursion_limit.3 +src/libpcre2/doc/pcre2_set_recursion_memory_management.3 +src/libpcre2/doc/pcre2_set_substitute_callout.3 +src/libpcre2/doc/pcre2_set_substitute_case_callout.3 +src/libpcre2/doc/pcre2_substitute.3 +src/libpcre2/doc/pcre2_substring_copy_byname.3 +src/libpcre2/doc/pcre2_substring_copy_bynumber.3 +src/libpcre2/doc/pcre2_substring_free.3 +src/libpcre2/doc/pcre2_substring_get_byname.3 +src/libpcre2/doc/pcre2_substring_get_bynumber.3 +src/libpcre2/doc/pcre2_substring_length_byname.3 +src/libpcre2/doc/pcre2_substring_length_bynumber.3 +src/libpcre2/doc/pcre2_substring_list_free.3 +src/libpcre2/doc/pcre2_substring_list_get.3 +src/libpcre2/doc/pcre2_substring_nametable_scan.3 +src/libpcre2/doc/pcre2_substring_number_from_name.3 
+src/libpcre2/doc/pcre2api.3 +src/libpcre2/doc/pcre2build.3 +src/libpcre2/doc/pcre2callout.3 +src/libpcre2/doc/pcre2compat.3 +src/libpcre2/doc/pcre2convert.3 +src/libpcre2/doc/pcre2demo.3 +src/libpcre2/doc/pcre2grep.1 +src/libpcre2/doc/pcre2grep.txt +src/libpcre2/doc/pcre2jit.3 +src/libpcre2/doc/pcre2limits.3 +src/libpcre2/doc/pcre2matching.3 +src/libpcre2/doc/pcre2partial.3 +src/libpcre2/doc/pcre2pattern.3 +src/libpcre2/doc/pcre2perform.3 +src/libpcre2/doc/pcre2posix.3 +src/libpcre2/doc/pcre2sample.3 +src/libpcre2/doc/pcre2serialize.3 +src/libpcre2/doc/pcre2syntax.3 +src/libpcre2/doc/pcre2test.1 +src/libpcre2/doc/pcre2test.txt +src/libpcre2/doc/pcre2unicode.3 +src/libpcre2/doc/html/NON-AUTOTOOLS-BUILD.txt +src/libpcre2/doc/html/README.txt +src/libpcre2/doc/html/index.html +src/libpcre2/doc/html/pcre2-config.html +src/libpcre2/doc/html/pcre2.html +src/libpcre2/doc/html/pcre2_callout_enumerate.html +src/libpcre2/doc/html/pcre2_code_copy.html +src/libpcre2/doc/html/pcre2_code_copy_with_tables.html +src/libpcre2/doc/html/pcre2_code_free.html +src/libpcre2/doc/html/pcre2_compile.html +src/libpcre2/doc/html/pcre2_compile_context_copy.html +src/libpcre2/doc/html/pcre2_compile_context_create.html +src/libpcre2/doc/html/pcre2_compile_context_free.html +src/libpcre2/doc/html/pcre2_config.html +src/libpcre2/doc/html/pcre2_convert_context_copy.html +src/libpcre2/doc/html/pcre2_convert_context_create.html +src/libpcre2/doc/html/pcre2_convert_context_free.html +src/libpcre2/doc/html/pcre2_converted_pattern_free.html +src/libpcre2/doc/html/pcre2_dfa_match.html +src/libpcre2/doc/html/pcre2_general_context_copy.html +src/libpcre2/doc/html/pcre2_general_context_create.html +src/libpcre2/doc/html/pcre2_general_context_free.html +src/libpcre2/doc/html/pcre2_get_error_message.html +src/libpcre2/doc/html/pcre2_get_mark.html +src/libpcre2/doc/html/pcre2_get_match_data_heapframes_size.html +src/libpcre2/doc/html/pcre2_get_match_data_size.html 
+src/libpcre2/doc/html/pcre2_get_ovector_count.html +src/libpcre2/doc/html/pcre2_get_ovector_pointer.html +src/libpcre2/doc/html/pcre2_get_startchar.html +src/libpcre2/doc/html/pcre2_jit_compile.html +src/libpcre2/doc/html/pcre2_jit_free_unused_memory.html +src/libpcre2/doc/html/pcre2_jit_match.html +src/libpcre2/doc/html/pcre2_jit_stack_assign.html +src/libpcre2/doc/html/pcre2_jit_stack_create.html +src/libpcre2/doc/html/pcre2_jit_stack_free.html +src/libpcre2/doc/html/pcre2_maketables.html +src/libpcre2/doc/html/pcre2_maketables_free.html +src/libpcre2/doc/html/pcre2_match.html +src/libpcre2/doc/html/pcre2_match_context_copy.html +src/libpcre2/doc/html/pcre2_match_context_create.html +src/libpcre2/doc/html/pcre2_match_context_free.html +src/libpcre2/doc/html/pcre2_match_data_create.html +src/libpcre2/doc/html/pcre2_match_data_create_from_pattern.html +src/libpcre2/doc/html/pcre2_match_data_free.html +src/libpcre2/doc/html/pcre2_next_match.html +src/libpcre2/doc/html/pcre2_pattern_convert.html +src/libpcre2/doc/html/pcre2_pattern_info.html +src/libpcre2/doc/html/pcre2_serialize_decode.html +src/libpcre2/doc/html/pcre2_serialize_encode.html +src/libpcre2/doc/html/pcre2_serialize_free.html +src/libpcre2/doc/html/pcre2_serialize_get_number_of_codes.html +src/libpcre2/doc/html/pcre2_set_bsr.html +src/libpcre2/doc/html/pcre2_set_callout.html +src/libpcre2/doc/html/pcre2_set_character_tables.html +src/libpcre2/doc/html/pcre2_set_compile_extra_options.html +src/libpcre2/doc/html/pcre2_set_compile_recursion_guard.html +src/libpcre2/doc/html/pcre2_set_depth_limit.html +src/libpcre2/doc/html/pcre2_set_glob_escape.html +src/libpcre2/doc/html/pcre2_set_glob_separator.html +src/libpcre2/doc/html/pcre2_set_heap_limit.html +src/libpcre2/doc/html/pcre2_set_match_limit.html +src/libpcre2/doc/html/pcre2_set_max_pattern_compiled_length.html +src/libpcre2/doc/html/pcre2_set_max_pattern_length.html +src/libpcre2/doc/html/pcre2_set_max_varlookbehind.html 
+src/libpcre2/doc/html/pcre2_set_newline.html +src/libpcre2/doc/html/pcre2_set_offset_limit.html +src/libpcre2/doc/html/pcre2_set_optimize.html +src/libpcre2/doc/html/pcre2_set_parens_nest_limit.html +src/libpcre2/doc/html/pcre2_set_recursion_limit.html +src/libpcre2/doc/html/pcre2_set_recursion_memory_management.html +src/libpcre2/doc/html/pcre2_set_substitute_callout.html +src/libpcre2/doc/html/pcre2_set_substitute_case_callout.html +src/libpcre2/doc/html/pcre2_substitute.html +src/libpcre2/doc/html/pcre2_substring_copy_byname.html +src/libpcre2/doc/html/pcre2_substring_copy_bynumber.html +src/libpcre2/doc/html/pcre2_substring_free.html +src/libpcre2/doc/html/pcre2_substring_get_byname.html +src/libpcre2/doc/html/pcre2_substring_get_bynumber.html +src/libpcre2/doc/html/pcre2_substring_length_byname.html +src/libpcre2/doc/html/pcre2_substring_length_bynumber.html +src/libpcre2/doc/html/pcre2_substring_list_free.html +src/libpcre2/doc/html/pcre2_substring_list_get.html +src/libpcre2/doc/html/pcre2_substring_nametable_scan.html +src/libpcre2/doc/html/pcre2_substring_number_from_name.html +src/libpcre2/doc/html/pcre2api.html +src/libpcre2/doc/html/pcre2build.html +src/libpcre2/doc/html/pcre2callout.html +src/libpcre2/doc/html/pcre2compat.html +src/libpcre2/doc/html/pcre2convert.html +src/libpcre2/doc/html/pcre2demo.html +src/libpcre2/doc/html/pcre2grep.html +src/libpcre2/doc/html/pcre2jit.html +src/libpcre2/doc/html/pcre2limits.html +src/libpcre2/doc/html/pcre2matching.html +src/libpcre2/doc/html/pcre2partial.html +src/libpcre2/doc/html/pcre2pattern.html +src/libpcre2/doc/html/pcre2perform.html +src/libpcre2/doc/html/pcre2posix.html +src/libpcre2/doc/html/pcre2sample.html +src/libpcre2/doc/html/pcre2serialize.html +src/libpcre2/doc/html/pcre2syntax.html +src/libpcre2/doc/html/pcre2test.html +src/libpcre2/doc/html/pcre2unicode.html +src/libpcre2/m4/ax_check_vscript.m4 +src/libpcre2/m4/ax_pthread.m4 +src/libpcre2/m4/libtool.m4 +src/libpcre2/m4/ltoptions.m4 
+src/libpcre2/m4/ltsugar.m4 +src/libpcre2/m4/ltversion.m4 +src/libpcre2/m4/lt~obsolete.m4 +src/libpcre2/m4/pcre2_visibility.m4 +src/libpcre2/m4/pcre2_zos.m4 +src/libpcre2/maint/.gitignore +src/libpcre2/maint/132html +src/libpcre2/maint/CheckMan +src/libpcre2/maint/CheckTxt +src/libpcre2/maint/CleanTxt +src/libpcre2/maint/Detrail +src/libpcre2/maint/FilterCoverage.py +src/libpcre2/maint/GenerateCommon.py +src/libpcre2/maint/GenerateTest.py +src/libpcre2/maint/GenerateUcd.py +src/libpcre2/maint/GenerateUcpHeader.py +src/libpcre2/maint/GenerateUcpTables.py +src/libpcre2/maint/LintMan +src/libpcre2/maint/ManyConfigTests +src/libpcre2/maint/README +src/libpcre2/maint/RunCoverage +src/libpcre2/maint/RunManifestTest +src/libpcre2/maint/RunManifestTest.ps1 +src/libpcre2/maint/RunPerlTest +src/libpcre2/maint/RunSymbolTest +src/libpcre2/maint/RunSymbolTest.ps1 +src/libpcre2/maint/UpdateAlways +src/libpcre2/maint/UpdateCommon.py +src/libpcre2/maint/UpdateDates.py +src/libpcre2/maint/UpdateRelease.py +src/libpcre2/maint/manifest-cmakeinstall-freebsd +src/libpcre2/maint/manifest-cmakeinstall-linux +src/libpcre2/maint/manifest-cmakeinstall-macos +src/libpcre2/maint/manifest-cmakeinstall-solaris +src/libpcre2/maint/manifest-cmakeinstall-windows +src/libpcre2/maint/manifest-libpcre2-16.so +src/libpcre2/maint/manifest-libpcre2-32.so +src/libpcre2/maint/manifest-libpcre2-8.so +src/libpcre2/maint/manifest-libpcre2-posix.so +src/libpcre2/maint/manifest-makeinstall-freebsd +src/libpcre2/maint/manifest-makeinstall-linux +src/libpcre2/maint/manifest-makeinstall-solaris +src/libpcre2/maint/manifest-tarball +src/libpcre2/maint/pcre2_chartables.c.non-standard +src/libpcre2/maint/ucptest.c +src/libpcre2/maint/utf8.c +src/libpcre2/maint/Unicode.tables/BidiMirroring.txt +src/libpcre2/maint/Unicode.tables/CaseFolding.txt +src/libpcre2/maint/Unicode.tables/DerivedBidiClass.txt +src/libpcre2/maint/Unicode.tables/DerivedCoreProperties.txt 
+src/libpcre2/maint/Unicode.tables/DerivedGeneralCategory.txt +src/libpcre2/maint/Unicode.tables/GraphemeBreakProperty.txt +src/libpcre2/maint/Unicode.tables/PropList.txt +src/libpcre2/maint/Unicode.tables/PropertyAliases.txt +src/libpcre2/maint/Unicode.tables/PropertyValueAliases.txt +src/libpcre2/maint/Unicode.tables/ScriptExtensions.txt +src/libpcre2/maint/Unicode.tables/Scripts.txt +src/libpcre2/maint/Unicode.tables/UnicodeData.txt +src/libpcre2/maint/Unicode.tables/emoji-data.txt +src/libpcre2/maint/cmake-tests/build-interface/CMakeLists.txt +src/libpcre2/maint/cmake-tests/build-interface/main.c +src/libpcre2/maint/cmake-tests/install-interface/CMakeLists.txt +src/libpcre2/maint/cmake-tests/install-interface/main.c +src/libpcre2/maint/ucptestdata/testinput1 +src/libpcre2/maint/ucptestdata/testinput2 +src/libpcre2/maint/ucptestdata/testoutput1 +src/libpcre2/maint/ucptestdata/testoutput2 +src/libpcre2/src/config-cmake.h.in +src/libpcre2/src/config.h.generic +src/libpcre2/src/config.h.in +src/libpcre2/src/libpcre2-16.sym +src/libpcre2/src/libpcre2-32.sym +src/libpcre2/src/libpcre2-8.sym +src/libpcre2/src/libpcre2-posix.sym +src/libpcre2/src/pcre2.h.generic +src/libpcre2/src/pcre2.h.in +src/libpcre2/src/pcre2_auto_possess.c +src/libpcre2/src/pcre2_chartables.c.dist +src/libpcre2/src/pcre2_chartables.c.ebcdic-1047-nl15 +src/libpcre2/src/pcre2_chartables.c.ebcdic-1047-nl25 +src/libpcre2/src/pcre2_chkdint.c +src/libpcre2/src/pcre2_compile.c +src/libpcre2/src/pcre2_compile.h +src/libpcre2/src/pcre2_compile_cgroup.c +src/libpcre2/src/pcre2_compile_class.c +src/libpcre2/src/pcre2_config.c +src/libpcre2/src/pcre2_context.c +src/libpcre2/src/pcre2_convert.c +src/libpcre2/src/pcre2_dfa_match.c +src/libpcre2/src/pcre2_dftables.c +src/libpcre2/src/pcre2_error.c +src/libpcre2/src/pcre2_extuni.c +src/libpcre2/src/pcre2_find_bracket.c +src/libpcre2/src/pcre2_fuzzsupport.c +src/libpcre2/src/pcre2_internal.h +src/libpcre2/src/pcre2_intmodedep.h 
+src/libpcre2/src/pcre2_jit_char_inc.h +src/libpcre2/src/pcre2_jit_compile.c +src/libpcre2/src/pcre2_jit_match_inc.h +src/libpcre2/src/pcre2_jit_misc_inc.h +src/libpcre2/src/pcre2_jit_simd_inc.h +src/libpcre2/src/pcre2_jit_test.c +src/libpcre2/src/pcre2_maketables.c +src/libpcre2/src/pcre2_match.c +src/libpcre2/src/pcre2_match_data.c +src/libpcre2/src/pcre2_match_next.c +src/libpcre2/src/pcre2_newline.c +src/libpcre2/src/pcre2_ord2utf.c +src/libpcre2/src/pcre2_pattern_info.c +src/libpcre2/src/pcre2_printint_inc.h +src/libpcre2/src/pcre2_script_run.c +src/libpcre2/src/pcre2_serialize.c +src/libpcre2/src/pcre2_string_utils.c +src/libpcre2/src/pcre2_study.c +src/libpcre2/src/pcre2_substitute.c +src/libpcre2/src/pcre2_substring.c +src/libpcre2/src/pcre2_tables.c +src/libpcre2/src/pcre2_ucd.c +src/libpcre2/src/pcre2_ucp.h +src/libpcre2/src/pcre2_ucptables_inc.h +src/libpcre2/src/pcre2_util.h +src/libpcre2/src/pcre2_valid_utf.c +src/libpcre2/src/pcre2_xclass.c +src/libpcre2/src/pcre2demo.c +src/libpcre2/src/pcre2grep.c +src/libpcre2/src/pcre2posix.c +src/libpcre2/src/pcre2posix.h +src/libpcre2/src/pcre2posix_test.c +src/libpcre2/src/pcre2test.c +src/libpcre2/src/pcre2test_inc.h +src/libpcre2/testdata/grepbinary +src/libpcre2/testdata/grepfilelist +src/libpcre2/testdata/grepinput +src/libpcre2/testdata/grepinput3 +src/libpcre2/testdata/grepinput8 +src/libpcre2/testdata/grepinputBad8 +src/libpcre2/testdata/grepinputBad8_Trail +src/libpcre2/testdata/grepinputC.bz2 +src/libpcre2/testdata/grepinputC.gz +src/libpcre2/testdata/grepinputM +src/libpcre2/testdata/grepinputUN +src/libpcre2/testdata/grepinputv +src/libpcre2/testdata/grepinputx +src/libpcre2/testdata/greplist +src/libpcre2/testdata/greplistBad +src/libpcre2/testdata/grepnot.bz2 +src/libpcre2/testdata/grepoutput +src/libpcre2/testdata/grepoutput8 +src/libpcre2/testdata/grepoutputC +src/libpcre2/testdata/grepoutputCN +src/libpcre2/testdata/grepoutputCNU +src/libpcre2/testdata/grepoutputCU 
+src/libpcre2/testdata/grepoutputCbz2 +src/libpcre2/testdata/grepoutputCgz +src/libpcre2/testdata/grepoutputN +src/libpcre2/testdata/grepoutputUN +src/libpcre2/testdata/greppatN4 +src/libpcre2/testdata/testbtables +src/libpcre2/testdata/testinput1 +src/libpcre2/testdata/testinput10 +src/libpcre2/testdata/testinput11 +src/libpcre2/testdata/testinput12 +src/libpcre2/testdata/testinput13 +src/libpcre2/testdata/testinput14 +src/libpcre2/testdata/testinput15 +src/libpcre2/testdata/testinput16 +src/libpcre2/testdata/testinput17 +src/libpcre2/testdata/testinput18 +src/libpcre2/testdata/testinput19 +src/libpcre2/testdata/testinput2 +src/libpcre2/testdata/testinput20 +src/libpcre2/testdata/testinput21 +src/libpcre2/testdata/testinput22 +src/libpcre2/testdata/testinput23 +src/libpcre2/testdata/testinput24 +src/libpcre2/testdata/testinput25 +src/libpcre2/testdata/testinput26 +src/libpcre2/testdata/testinput27 +src/libpcre2/testdata/testinput28 +src/libpcre2/testdata/testinput29 +src/libpcre2/testdata/testinput3 +src/libpcre2/testdata/testinput4 +src/libpcre2/testdata/testinput5 +src/libpcre2/testdata/testinput6 +src/libpcre2/testdata/testinput7 +src/libpcre2/testdata/testinput8 +src/libpcre2/testdata/testinput9 +src/libpcre2/testdata/testinputheap +src/libpcre2/testdata/testoutput1 +src/libpcre2/testdata/testoutput10 +src/libpcre2/testdata/testoutput11-16 +src/libpcre2/testdata/testoutput11-32 +src/libpcre2/testdata/testoutput12-16 +src/libpcre2/testdata/testoutput12-32 +src/libpcre2/testdata/testoutput13 +src/libpcre2/testdata/testoutput14-16 +src/libpcre2/testdata/testoutput14-32 +src/libpcre2/testdata/testoutput14-8 +src/libpcre2/testdata/testoutput15 +src/libpcre2/testdata/testoutput16 +src/libpcre2/testdata/testoutput17 +src/libpcre2/testdata/testoutput18 +src/libpcre2/testdata/testoutput19 +src/libpcre2/testdata/testoutput2 +src/libpcre2/testdata/testoutput20 +src/libpcre2/testdata/testoutput21 +src/libpcre2/testdata/testoutput22-16 
+src/libpcre2/testdata/testoutput22-32 +src/libpcre2/testdata/testoutput22-8 +src/libpcre2/testdata/testoutput23 +src/libpcre2/testdata/testoutput24 +src/libpcre2/testdata/testoutput25 +src/libpcre2/testdata/testoutput26 +src/libpcre2/testdata/testoutput27 +src/libpcre2/testdata/testoutput28 +src/libpcre2/testdata/testoutput29 +src/libpcre2/testdata/testoutput3 +src/libpcre2/testdata/testoutput3A +src/libpcre2/testdata/testoutput3B +src/libpcre2/testdata/testoutput3C +src/libpcre2/testdata/testoutput4 +src/libpcre2/testdata/testoutput5 +src/libpcre2/testdata/testoutput6 +src/libpcre2/testdata/testoutput7 +src/libpcre2/testdata/testoutput8-16-2 +src/libpcre2/testdata/testoutput8-16-4 +src/libpcre2/testdata/testoutput8-32-4 +src/libpcre2/testdata/testoutput8-8-2 +src/libpcre2/testdata/testoutput8-8-3 +src/libpcre2/testdata/testoutput8-8-4 +src/libpcre2/testdata/testoutput9 +src/libpcre2/testdata/testoutputheap-16 +src/libpcre2/testdata/testoutputheap-32 +src/libpcre2/testdata/testoutputheap-8 +src/libpcre2/testdata/valgrind-jit.supp +src/libpcre2/testdata/wintestinput3 +src/libpcre2/testdata/wintestoutput3 +src/libpcre2/testdata/fuzzing/pcre2_fuzzer.dict +src/libpcre2/testdata/fuzzing/pcre2_fuzzer.options +src/libpcre2/testdata/fuzzing/pcre2_fuzzer_16.dict +src/libpcre2/testdata/fuzzing/pcre2_fuzzer_16.options +src/libpcre2/testdata/fuzzing/pcre2_fuzzer_32.dict +src/libpcre2/testdata/fuzzing/pcre2_fuzzer_32.options +src/libpcre2/vms/configure.com +src/libpcre2/vms/openvms_readme.txt +src/libpcre2/vms/pcre2.h_patch +src/libpcre2/vms/stdint.h +src/pcre2/CMakeLists.txt +src/pcre2/__init__.py +src/pcre2/_cy.pyx +src/pcre2/_libpcre2.pxd +src/pcre2.egg-info/PKG-INFO +src/pcre2.egg-info/SOURCES.txt +src/pcre2.egg-info/dependency_links.txt +src/pcre2.egg-info/top_level.txt +tests/test_groups.py +tests/test_match.py +tests/test_pattern.py +tests/test_re_compatibility.py \ No newline at end of file diff --git a/src/pcre2.egg-info/dependency_links.txt 
b/src/pcre2.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/pcre2.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/pcre2.egg-info/top_level.txt b/src/pcre2.egg-info/top_level.txt new file mode 100644 index 0000000..92d5e6d --- /dev/null +++ b/src/pcre2.egg-info/top_level.txt @@ -0,0 +1 @@ +pcre2 diff --git a/src/pcre2/CMakeLists.txt b/src/pcre2/CMakeLists.txt new file mode 100644 index 0000000..9508aee --- /dev/null +++ b/src/pcre2/CMakeLists.txt @@ -0,0 +1,34 @@ +find_package(Cython MODULE REQUIRED) +find_package(PythonExtensions MODULE REQUIRED) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +# Build Cython with annotations. +set(CYTHON_ANNOTATE TRUE) + +# Macro to add Cython files as modules, configured to build with PCRE2. +macro(add_pyx_file filename) + add_cython_target(${filename} C PY3) + add_library(${filename} MODULE ${filename}) + python_extension_module(${filename}) + + target_link_libraries(${filename} pcre2-8-static) + target_include_directories(${filename} PRIVATE ${PCRE2_INCLUDE_DIR}) + target_compile_options(${filename} PRIVATE ${CYTHON_EXTRA_COMPILE_ARGS}) + + install(TARGETS ${filename} LIBRARY DESTINATION src/pcre2) +endmacro() + +# GLOB pattern is recommended against, +# https://cmake.org/cmake/help/v3.14/command/file.html?highlight=file#filesystem +add_pyx_file(_cy) + + +# Include .pyx and .pxd files in distribution for use by Cython API. +install( + FILES + _libpcre2.pxd + _cy.pyx + DESTINATION + src/pcre2 +) \ No newline at end of file diff --git a/src/pcre2/__init__.py b/src/pcre2/__init__.py new file mode 100755 index 0000000..732a764 --- /dev/null +++ b/src/pcre2/__init__.py @@ -0,0 +1,497 @@ +from . 
def _typeguard_strings(s):
    """
    Normalize `s` to one of the two subject types handled by this module.

    `str` input (including subclasses) is returned as a plain `str`. `bytes`, `bytearray` and
    `memoryview` input is copied into a `bytes` object.

    Raises `TypeError` for any other type.
    """
    if isinstance(s, str):
        return str(s)
    if isinstance(s, (bytes, bytearray, memoryview)):
        return bytes(s)
    # Report the offending type, not the value: the message is about an unsupported type, and
    # the value itself may be arbitrarily large or have an unhelpful repr.
    raise TypeError(f"Cannot process type {type(s).__name__}")
def search(pattern, string, flags=0, jit=True):
    """
    Look for the pattern anywhere in `string`.

    `pattern` is first passed through `compile` together with `flags` and `jit`; the result of
    the compiled pattern's `search` method is returned, i.e. a Match object, or None if no
    match was found.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.search(string)
def split(pattern, string, maxsplit=0, flags=0, jit=True):
    """
    Cut `string` at each occurrence of the pattern and return the pieces as a list.

    If the pattern contains capture groups, the text of all groups is included in the result
    as well. A non-zero `maxsplit` caps the number of splits; the unsplit remainder of `string`
    becomes the final element of the list.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.split(string, maxsplit)
def __setstate__(self, state):
    """
    Restore a pickled pattern.

    The compiled PCRE2 code is dropped from the pickled state by `__getstate__`, so the
    pattern is recompiled here from `self.pattern` and `self.flags`, and JIT compiled again
    when `self.jit` is set.
    """
    self.__dict__.update(state)

    # Derive the PCRE2 options exactly as `compile` does: `RegexFlag.ASCII` is not a PCRE2
    # option bit, it stands for "disable UCP", so it must be stripped from the options and
    # translated into a disabled option instead of being passed through verbatim.
    flags = RegexFlag(self.flags)
    options = flags & ~RegexFlag.ASCII
    disabled_options = _cy.CompileOption.UCP if flags & RegexFlag.ASCII else 0

    self._pcre2_code = _cy.compile(self.pattern, options, disabled_options)
    if self.jit:
        _cy.jit_compile(self._pcre2_code)
def findall(self, string, pos=0, endpos=maxsize):
    """
    Return a list of all non-overlapping matches in `string`.

    The shape of each item depends on the number of capture groups in the pattern:

    * no groups: the whole match (`match.group()`);
    * one group: the value of that group;
    * several groups: a tuple with the value of every group.

    Groups whose value is None are reported as an empty `str`/`bytes` (matching the type of
    the normalized subject). Empty matches are included in the result.
    """
    string = _typeguard_strings(string)
    empty = type(string)()
    # The group count is fixed for a compiled pattern; read the property once instead of
    # once (or twice) per match.
    group_count = self.groups
    matches = self.finditer(string, pos, endpos)

    if not group_count:
        return [match.group() for match in matches]
    if group_count == 1:
        return [match.groups(default=empty)[0] for match in matches]
    return [match.groups(default=empty) for match in matches]
def subn(self, repl, string, count=0):
    """
    Return a tuple containing `(res, number)`. `res` is the string obtained by replacing the
    leftmost non-overlapping occurrences of the pattern in `string` by the replacement `repl`.
    `number` is the number of substitutions that were made.

    `repl` can be either a template string or a callable. If it is a callable, it is passed
    the Match object and must return a replacement string to be used.

    `count` limits the number of replacements; 0 means "replace all". A negative `count`
    returns the (normalized) subject unchanged with a substitution count of 0.
    """
    string = _typeguard_strings(string)
    if count < 0:
        return (string, 0)

    use_callback = callable(repl)
    if not use_callback:
        # Short circuit: an unlimited template substitution is a single global substitute.
        if count == 0:
            return self._suball(repl, string)
        repl = _typeguard_strings(repl)

    # Limited and/or callback substitution is assembled match by match. Templates are
    # expanded against each match found in the *full* subject; substituting on a truncated
    # copy of the subject would change what lookaheads and end anchors can see and may lose
    # or alter the counted matches.
    parts = []
    start = 0
    numsubs = 0
    for match in islice(self.finditer(string), count or None):
        parts.append(string[start : match.start()])
        parts.append(repl(match) if use_callback else match.expand(repl))
        start = match.end()
        numsubs += 1
    parts.append(string[start:])

    empty = type(string)()
    return empty.join(parts), numsubs
def _groupguard(self, group):
    """
    Resolve a group reference to a validated group number.

    `group` may be a group name (`str`, looked up in `self.re.groupindex`), an `int`, or any
    object implementing `__index__`. Numeric references must lie in the range
    `0 <= n <= self.re.groups`.

    Raises `IndexError` if the name is unknown, the number is out of range, or `group` is of
    an unsupported type.
    """
    if isinstance(group, str):
        if group not in self.re.groupindex:
            raise IndexError("no such group")
        return self.re.groupindex[group]

    if isinstance(group, int):
        group_number = group
    elif hasattr(group, "__index__"):
        group_number = int(group.__index__())
    else:
        raise IndexError("No such group")

    # Apply the range check to every numeric reference, including index-like objects, so an
    # out-of-range number is never handed on to the PCRE2 substring lookups.
    if not 0 <= group_number <= self.re.groups:
        raise IndexError("No such group")
    return group_number
+ """ + if not groups: + groups = (0,) + items = map(self.__getitem__, groups) + return next(items) if len(groups) == 1 else tuple(items) + + def groups(self, default=None): + """ + Return a tuple containing all the subgroups of the match. + """ + items = [] + for group in range(1, self.re.groups + 1): + item = self.__getitem__(group) + items.append(default if item is None else item) + return tuple(items) + + def groupdict(self, default=None): + """ + Return a dictionary mapping subgroup name to matched substring for all the named subgroups. + """ + items = [] + for group, index in self.re.groupindex.items(): + item = self.__getitem__(index) + items.append((group, default) if item is None else (group, item)) + return dict(items) + + def start(self, group=0): + """ + Return the start index of the substring matched by `group`. + """ + return self.span(group)[0] + + def end(self, group=0): + """ + Return the end index of the substring matched by `group`. + """ + return self.span(group)[1] + + @property + @lru_cache(1) + def lastindex(self): + max_end = -1 + max_group = None + # We look for the rightmost right parenthesis by keeping the first group that ends at + # max_end because that is the leftmost/outermost group when there are nested groups! 
+ for group in range(1, self.re.groups + 1): + end = self.end(group) + if max_end < end: + max_end = end + max_group = group + return max_group + + @property + @lru_cache(1) + def lastgroup(self): + max_group = self.lastindex + if not max_group: + return None + for group, index in self.re.groupindex.items(): + if max_group == index: + return group + return None diff --git a/src/pcre2/_cy.pyx b/src/pcre2/_cy.pyx new file mode 100644 index 0000000..8993782 --- /dev/null +++ b/src/pcre2/_cy.pyx @@ -0,0 +1,590 @@ +# -*- coding:utf-8 -*- +# cython: profile=True + +from libc.stdint cimport uint8_t, uint32_t +from libc.stdlib cimport malloc, free +from libc.string cimport strlen +from cpython.unicode cimport PyUnicode_Check, PyUnicode_AsUTF8AndSize +from cpython.bytes cimport PyBytes_Check, PyBytes_AsStringAndSize + +from _libpcre2 cimport * + +from enum import IntFlag + + +__libpcre2_version__ = f"{PCRE2_MAJOR}.{PCRE2_MINOR}" + + +# ============================================================================ +# Pointer Proxies + +# Pointer wrappers to manage lifetime and expose to Python code +cdef class PCRE2Code: + cdef pcre2_code_t *ptr + cdef bint _pattern_is_str + + @staticmethod + cdef PCRE2Code from_ptr(pcre2_code_t *ptr, bint pattern_is_str): + """ Ownership of pointer is taken by the new instance """ + cdef PCRE2Code code + code = PCRE2Code.__new__(PCRE2Code) + code.ptr = ptr + code._pattern_is_str = pattern_is_str + return code + + def __init__(self, *args, **kwargs): + # Prevent accidental instantiation from normal Python code + raise TypeError(f"Cannot create 'PCRE2Code' instances") + + def __dealloc__(self): + if self.ptr is not NULL: + pcre2_code_free(self.ptr) + + +cdef class PCRE2MatchData: + cdef pcre2_match_data_t *ptr + + @staticmethod + cdef PCRE2MatchData from_ptr(pcre2_match_data_t *ptr): + """ Ownership of pointer is always taken by the new instance """ + cdef PCRE2MatchData match_data + match_data = PCRE2MatchData.__new__(PCRE2MatchData) + 
match_data.ptr = ptr + return match_data + + def __init__(self, *args, **kwargs): + # Prevent accidental instantiation from normal Python code + raise TypeError(f"Cannot create 'PCRE2MatchData' instances") + + def __dealloc__(self): + if self.ptr is not NULL: + pcre2_match_data_free(self.ptr) + + +# ============================================================================ +# Buffer Acquisition + +cdef (uint8_t *, size_t) as_sptr_and_size(object obj) except *: + cdef: + int rc + char *sptr = NULL + Py_ssize_t length = 0 + + # Encode unicode strings as UTF-8 buffers + if PyUnicode_Check(obj): + sptr = PyUnicode_AsUTF8AndSize(obj, &length) + assert(sptr is not NULL) # The function is supposed to throw on errors + elif PyBytes_Check(obj): + rc = PyBytes_AsStringAndSize(obj, &sptr, &length) + assert(rc == 0) + else: + raise ValueError("Only objects of type 'str' and 'bytes' are supported") + return sptr, length + + +# ============================================================================ +# Unicode Indexing + +cdef size_t idx_byte_to_char( + uint8_t *sptr, size_t byte_idx, size_t start_byte_idx = 0, size_t start_char_idx = 0 +): + cdef: + size_t cur_byte_idx = start_byte_idx + size_t cur_char_idx = start_char_idx + + while cur_byte_idx < byte_idx: + if (sptr[cur_byte_idx] & 0xC0) != 0x80: + cur_char_idx += 1 + cur_byte_idx += 1 + + return cur_char_idx + + +cdef size_t idx_char_to_byte( + uint8_t *sptr, size_t sptr_size, + size_t char_idx, + size_t start_byte_idx = 0, + size_t start_char_idx = 0, +): + cdef: + size_t cur_byte_idx = start_byte_idx + size_t cur_char_idx = start_char_idx + + if cur_char_idx < char_idx: + while cur_char_idx < char_idx: + if (sptr[cur_byte_idx] & 0xC0) != 0x80: + cur_char_idx += 1 + cur_byte_idx += 1 + + while cur_byte_idx < sptr_size and (sptr[cur_byte_idx] & 0xC0) == 0x80: + cur_byte_idx += 1 + + return cur_byte_idx + + +# ============================================================================ +# Exceptions + +class
LibraryError(Exception): + def __init__(self, int errcode, object ctxmsg = None): + cdef: + uint8_t errmsg_sptr[120] + int rc + + rc = pcre2_get_error_message(errcode, errmsg_sptr, sizeof(errmsg_sptr)) + if rc == PCRE2_ERROR_NOMEMORY: + raise MemoryError + elif rc == PCRE2_ERROR_BADDATA: + raise ValueError(f"Unrecognized PCRE2 error code {errcode}") + elif rc < 0: + raise RuntimeError(f"Unhandled error code {rc} raised when getting error message") + + # For non-negative values, return code is the length of the message + errmsg = errmsg_sptr[:rc].decode("UTF-8") + if ctxmsg: + errmsg = f"{ctxmsg}; {errmsg}" + + super().__init__(errmsg) + self.msg = errmsg + self.code = errcode + + +class PatternError(LibraryError): + def __init__(self, int errcode, errpos): + super().__init__(errcode, ctxmsg=f"compilation failed at position {errpos}") + self.pos = errpos + + +cdef inline void raise_from_rc(int rc): + if rc < 0: + raise LibraryError(rc) + + +# ============================================================================ +# Pattern Compilation + + +class CompileOption(IntFlag): + CASELESS = PCRE2_CASELESS + DOTALL = PCRE2_DOTALL + MULTILINE = PCRE2_MULTILINE + EXTENDED = PCRE2_EXTENDED + + # Controls the input codec (whether the input bytes are read into characters by UTF-8 + # decoding). If the input pattern is a `str`, the default behaviour is UNICODE (and this cannot + # be unset). If the input pattern is a `bytes`, the default is ASCII/Latin-1 (one byte per + # character), but UNICODE sets this to UTF-8. + UTF = PCRE2_UTF + + # Controls the interpretation of character values. If characters are ASCII, then (for example) + # '\w' does not match values outside the range 0-127. 
If the input pattern is compiled with + # the `UTF` option (whether `str` or `bytes`), the default behaviour is `UCP` enabled; this can + # be disabled by the `ASCII` flag in the Python wrapper + UCP = PCRE2_UCP + + +def compile(object pattern, uint32_t options = 0, disabled_options = 0): + cdef: + pcre2_code_t *code + uint8_t *patn_sptr + size_t patn_size + int rc + size_t errpos + + # Get views into object memory + patn_sptr, patn_size = as_sptr_and_size(pattern) + + # Lock out the use of \C which can lead to patterns matching within characters + options = options | PCRE2_NEVER_BACKSLASH_C + + # Set Python style '\uhhhh' syntax for literal unicode characters + options = options | PCRE2_ALT_BSUX + + # Default to UTF for 'str' patterns; 'bytes' patterns only get UTF when the caller passes it in + # `options` + if PyUnicode_Check(pattern): + options = options | PCRE2_UTF + + # Always default to Unicode property support if we are interpreting strings as Unicode for both + # 'str' and 'bytes' objects + if options & PCRE2_UTF: + options = options | PCRE2_UCP + + # Allow for disabling any of the options set + options = options & ~disabled_options + + code = pcre2_compile(patn_sptr, patn_size, options, &rc, &errpos, NULL) + if code is NULL: + if PyUnicode_Check(pattern): + errpos = idx_byte_to_char(patn_sptr, errpos) + + # For some errors (e.g., unclosed groups) the whole pattern must be scanned and the error + # position returned is the length of the string. 
This means that the total range of error + # offset values is [0, length] inclusive + raise PatternError(rc, errpos) + + return PCRE2Code.from_ptr(code, PyUnicode_Check(pattern)) + + +def jit_compile(PCRE2Code code not None): + raise_from_rc(pcre2_jit_compile(code.ptr, PCRE2_JIT_COMPLETE)) + + +# ============================================================================ +# Information Extraction + +def pattern_is_utf(PCRE2Code code not None): + cdef uint32_t all_options + raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_ALLOPTIONS, &all_options)) + return bool(all_options & PCRE2_UTF) + + +def pattern_capture_count(PCRE2Code code not None): + cdef uint32_t capture_count + raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_CAPTURECOUNT, &capture_count)) + return int(capture_count) + + +def pattern_name_dict(PCRE2Code code not None): + cdef: + const uint8_t *name_table + const uint8_t *name + uint32_t name_count, name_entry_size + int idx, offset + object encoding + + # Get name table related information + raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_NAMECOUNT, &name_count)) + raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size)) + raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_NAMETABLE, &name_table)) + + encoding = "UTF-8" if pattern_is_utf(code) else "Latin-1" + + # Convert byte table to dictionary mapping group names to numbers + name_dict = {} + for idx in range(name_count): + # Each name table entry holds the group number in its first two bytes, followed + # by the name string (which can be assumed to be in Latin-1 for non-unicode patterns). Default + # builds of PCRE2 only allow ASCII character names. 
+ offset = idx * name_entry_size + name = &name_table[offset + 2] + group_name = name[:strlen(name)].decode(encoding) + group_number = int((name_table[offset] << 8) | name_table[offset + 1]) + name_dict[group_name] = group_number + + return name_dict + + +def substring_span_bynumber(PCRE2MatchData match_data not None, object subject, size_t number): + cdef: + size_t *ovector + uint8_t *subj_sptr + size_t subj_size + int rc + size_t start + size_t end + + # Get views into object memory + subj_sptr, subj_size = as_sptr_and_size(subject) + + # Only perform offset lookup if group has been set + rc = pcre2_substring_length_bynumber(match_data.ptr, number, NULL) + if rc == 0: + ovector = pcre2_get_ovector_pointer(match_data.ptr) + start = ovector[2 * number] + end = ovector[2 * number + 1] + + if PyUnicode_Check(subject): + start = idx_byte_to_char(subj_sptr, start) + end = idx_byte_to_char(subj_sptr, end) + + return (start, end) + + return (-1, -1) + + +def substring_bynumber(PCRE2MatchData match_data not None, object subject, size_t number): + cdef: + size_t *ovector + uint8_t *subj_sptr + size_t subj_size + int rc + size_t start + size_t end + + # Get views into object memory + subj_sptr, subj_size = as_sptr_and_size(subject) + + # Only perform offset lookup if group has been set + rc = pcre2_substring_length_bynumber(match_data.ptr, number, NULL) + if rc == PCRE2_ERROR_UNSET: + return None + raise_from_rc(rc) + + ovector = pcre2_get_ovector_pointer(match_data.ptr) + start = ovector[2 * number] + end = ovector[2 * number + 1] + + res_obj = bytes(subj_sptr[start:end]) + if PyUnicode_Check(subject): + res_obj = res_obj.decode("UTF-8") + return res_obj + + +# ============================================================================ +# Matching + +class MatchOption(IntFlag): + ANCHORED = PCRE2_ANCHORED + ENDANCHORED = PCRE2_ENDANCHORED + +cdef pcre2_match_data_t * _pcre2_match_data_create_from_pattern( + const pcre2_code_t *code, pcre2_general_context_t *gcontext +): + 
return pcre2_match_data_create_from_pattern(code, gcontext) + +cdef int _pcre2_match( + const pcre2_code_t *code, + pcre2_sptr_t subject, + size_t length, + size_t startoffset, + uint32_t options, + pcre2_match_data_t *match_data, + pcre2_match_context_t *mcontext +): + return pcre2_match(code, subject, length, startoffset, options, match_data, mcontext) + +cdef PCRE2MatchData _match( + PCRE2Code code, + uint8_t *subj_sptr, + size_t byte_length, + size_t byte_offset, + uint32_t options, +) except *: + cdef: + pcre2_match_data_t *match_data_ptr + int rc + + # Allocate memory for match data, returning NULL if the memory could not be obtained + match_data_ptr = _pcre2_match_data_create_from_pattern(code.ptr, NULL) + if match_data_ptr is NULL: + raise MemoryError + + # Attempt match of pattern onto the subject + rc = _pcre2_match(code.ptr, subj_sptr, byte_length, byte_offset, options, match_data_ptr, NULL) + if rc == PCRE2_ERROR_NOMATCH: + return None + raise_from_rc(rc) + + return PCRE2MatchData.from_ptr(match_data_ptr) + +def match( + PCRE2Code code not None, + object subject, + size_t length, # length & offset in logical (index) units + size_t offset, + uint32_t options = 0, +): + cdef: + uint8_t *subj_sptr + size_t subj_size + + # Although the error message says "cannot use..." there would actually be nothing wrong at all + # with removing this block and allowing it. It's simply a matter of policy and clarity, and to + # match Python's re module. 
+ if code._pattern_is_str ^ PyUnicode_Check(subject): + if code._pattern_is_str: + raise TypeError("Cannot use a string pattern on a bytes-like object") + else: + raise TypeError("Cannot use a bytes pattern on a string-like object") + + # Get views into object memory + subj_sptr, subj_size = as_sptr_and_size(subject) + + if PyUnicode_Check(subject): + # Disable UTF-8 encoding checks for improved performance + options |= PCRE2_NO_UTF_CHECK + + length = ( + subj_size if length == len(subject) else idx_char_to_byte(subj_sptr, subj_size, length) + ) + offset = ( + subj_size if offset == len(subject) else idx_char_to_byte(subj_sptr, subj_size, offset) + ) + + return _match(code, subj_sptr, length, offset, options), offset, options + + +def match_generator( + PCRE2Code code not None, + object subject, + size_t length, # length & offset in logical (index) units + size_t offset, +): + cdef: + uint32_t starting_options = 0 + uint32_t state_options = 0 + uint32_t match_options + size_t byte_length = length + size_t byte_offset = offset + size_t match_byte_offset + + # Although the error message says "cannot use..." there would actually be nothing wrong at all + # with removing this block and allowing it. It's simply a matter of policy and clarity, and to + # match Python's re module. 
+ if code._pattern_is_str ^ PyUnicode_Check(subject): + if code._pattern_is_str: + raise TypeError("Cannot use a string pattern on a bytes-like object") + else: + raise TypeError("Cannot use a bytes pattern on a string-like object") + + # Get views into object memory + subj_sptr, subj_size = as_sptr_and_size(subject) + + if PyUnicode_Check(subject): + # Disable UTF-8 encoding checks for improved performance + starting_options |= PCRE2_NO_UTF_CHECK + + byte_length = ( + subj_size if length == len(subject) else idx_char_to_byte(subj_sptr, subj_size, length) + ) + byte_offset = ( + subj_size if offset == len(subject) else idx_char_to_byte(subj_sptr, subj_size, offset) + ) + + while byte_offset <= byte_length: + match_options = starting_options | state_options + match_byte_offset = byte_offset + match_data = _match(code, subj_sptr, byte_length, match_byte_offset, match_options) + if not match_data: + break + + else: + ovector = pcre2_get_ovector_pointer(match_data.ptr) + + assert(match_byte_offset <= ovector[0] and ovector[0] <= ovector[1]) + assert(ovector[1] > match_byte_offset or state_options == 0) + + if ovector[0] == ovector[1]: + # If the matched string is empty ensure the next match makes progress + state_options = PCRE2_NOTEMPTY_ATSTART + else: + state_options = 0 # Reset options so empty strings can match at next offset + + byte_offset = ovector[1] + + yield match_data, match_byte_offset, match_options + + # No need to re-match after an empty match at the end (it will just find nothing) + if ovector[0] == ovector[1] and ovector[1] >= byte_length: + break + + +# ============================================================================ +# Substitution + + +class SubstituteOption(IntFlag): + GLOBAL = PCRE2_SUBSTITUTE_GLOBAL + UNSET_EMPTY = PCRE2_SUBSTITUTE_UNSET_EMPTY + REPLACEMENT_ONLY = PCRE2_SUBSTITUTE_REPLACEMENT_ONLY + +def substitute( + PCRE2Code code not None, + object replacement, + object subject, + size_t byte_offset, # in bytes - unlike _cy.match() 
+ uint32_t options = 0, + PCRE2MatchData match_data = None, +): + cdef: + int rc + pcre2_match_data_t *match_data_ptr = NULL + uint8_t *subj_sptr + uint8_t *repl_sptr + uint8_t *res_sptr + size_t subj_size, repl_size, res_size + + # Always compute the needed length if there is any overflow + options |= PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + + # Add support for backslash escape characters and Python substitution forms + options |= PCRE2_SUBSTITUTE_EXTENDED + + # Although the error message says "cannot use..." there would actually be nothing wrong at all + # with removing this block and allowing it. It's simply a matter of policy and clarity, and to + # match Python's re module. + if code._pattern_is_str ^ PyUnicode_Check(subject): + if code._pattern_is_str: + raise TypeError("Cannot use a string pattern on a bytes-like object") + else: + raise TypeError("Cannot use a bytes pattern on a string-like object") + + # Similarly, ensure that there is a match between the type of subject and replacement. + # + # Unlike the check that pattern and subject match, this one cannot simply be removed. We + # pass in the PCRE2_NO_UTF_CHECK flag based on the type of subject, and that flag also affects + # the interpretation of replacement. So, we require a check that the replacement string is + # valid UTF-8, if the subject is a 'str' object (note that we could do this either by enforcing + # that replacement is a 'str', or we could allow bytes as well if we do the decode here to + # validate it). + # + # For policy and clarity, we additionally forbid using a 'str' replacement with a 'bytes' + # subject, although there is no issue with that combination. 
+ if PyUnicode_Check(subject) ^ PyUnicode_Check(replacement): + if PyUnicode_Check(subject): + raise TypeError("Cannot use a string subject with a bytes-like template") + else: + raise TypeError("Cannot use a bytes subject with a string-like template") + + # Get views into object memory + repl_sptr, repl_size = as_sptr_and_size(replacement) + subj_sptr, subj_size = as_sptr_and_size(subject) + + # Disable UTF-8 encoding checks for improved performance + if match_data is None and PyUnicode_Check(subject): + options |= PCRE2_NO_UTF_CHECK + + if match_data is not None: + match_data_ptr = match_data.ptr + options |= PCRE2_SUBSTITUTE_MATCHED + + # Make simple attempt at guess for required memory, unless match has already been made + res_size = subj_size + (subj_size // 2) if match_data is None else 0 + res_sptr = malloc(res_size * sizeof(uint8_t)) + try: + rc = pcre2_substitute( + code.ptr, + subj_sptr, subj_size, + byte_offset, + options, + match_data_ptr, + NULL, + repl_sptr, repl_size, + res_sptr, &res_size, + ) + # Reattempt substitution if no memory, now with required size of buffer known + if rc == PCRE2_ERROR_NOMEMORY: + free(res_sptr) + res_sptr = malloc(res_size * sizeof(uint8_t)) + rc = pcre2_substitute( + code.ptr, + subj_sptr, subj_size, + byte_offset, + options, + match_data_ptr, + NULL, + repl_sptr, repl_size, + res_sptr, &res_size, + ) + raise_from_rc(rc) + + # Non-error return code contains the number of substitutions made + res_obj = bytes(res_sptr[:res_size]) + if PyUnicode_Check(subject): + # Match the type of the return object to the input object + res_obj = res_obj.decode("UTF-8") + return (res_obj, rc) + + finally: + free(res_sptr) diff --git a/src/pcre2/_libpcre2.pxd b/src/pcre2/_libpcre2.pxd new file mode 100755 index 0000000..5f299c7 --- /dev/null +++ b/src/pcre2/_libpcre2.pxd @@ -0,0 +1,500 @@ +# -*- coding:utf-8 -*- + +from libc.stdint cimport uint8_t, uint32_t, int32_t + + +cdef extern from "pcre2.h": + cdef unsigned int PCRE2_MAJOR + cdef 
unsigned int PCRE2_MINOR + + # The following option bits can be passed to pcre2_compile(), + # pcre2_match(), or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the + # function to which it is passed. Put these bits at the most significant + # end of the options word so others can be added next to them. + cdef unsigned int PCRE2_ANCHORED + cdef unsigned int PCRE2_NO_UTF_CHECK + cdef unsigned int PCRE2_ENDANCHORED + + # The following option bits can be passed only to pcre2_compile(). However, + # they may affect compilation, JIT compilation, and/or interpretive + # execution. The following tags indicate which: + # C alters what is compiled by pcre2_compile() + # J alters what is compiled by pcre2_jit_compile() + # M is inspected during pcre2_match() execution + # D is inspected during pcre2_dfa_match() execution + cdef unsigned int PCRE2_ALLOW_EMPTY_CLASS # C + cdef unsigned int PCRE2_ALT_BSUX # C + cdef unsigned int PCRE2_AUTO_CALLOUT # C + cdef unsigned int PCRE2_CASELESS # C + cdef unsigned int PCRE2_DOLLAR_ENDONLY # J M D + cdef unsigned int PCRE2_DOTALL # C + cdef unsigned int PCRE2_DUPNAMES # C + cdef unsigned int PCRE2_EXTENDED # C + cdef unsigned int PCRE2_FIRSTLINE # J M D + cdef unsigned int PCRE2_MATCH_UNSET_BACKREF # C J M + cdef unsigned int PCRE2_MULTILINE # C + cdef unsigned int PCRE2_NEVER_UCP # C + cdef unsigned int PCRE2_NEVER_UTF # C + cdef unsigned int PCRE2_NO_AUTO_CAPTURE # C + cdef unsigned int PCRE2_NO_AUTO_POSSESS # C + cdef unsigned int PCRE2_NO_DOTSTAR_ANCHOR # C + cdef unsigned int PCRE2_NO_START_OPTIMIZE # J M D + cdef unsigned int PCRE2_UCP # C J M D + cdef unsigned int PCRE2_UNGREEDY # C + cdef unsigned int PCRE2_UTF # C J M D + cdef unsigned int PCRE2_NEVER_BACKSLASH_C # C + cdef unsigned int PCRE2_ALT_CIRCUMFLEX # J M D + cdef unsigned int PCRE2_ALT_VERBNAMES # C + cdef unsigned int PCRE2_USE_OFFSET_LIMIT # J M D + cdef unsigned int PCRE2_EXTENDED_MORE # C + cdef unsigned int PCRE2_LITERAL # C + cdef unsigned int 
PCRE2_MATCH_INVALID_UTF # J M D + + # An additional compile options word is available in the compile context. + cdef unsigned int PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES # C + cdef unsigned int PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL # C + cdef unsigned int PCRE2_EXTRA_MATCH_WORD # C + cdef unsigned int PCRE2_EXTRA_MATCH_LINE # C + cdef unsigned int PCRE2_EXTRA_ESCAPED_CR_IS_LF # C + cdef unsigned int PCRE2_EXTRA_ALT_BSUX # C + cdef unsigned int PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK # C + + # These are for pcre2_jit_compile(). + cdef unsigned int PCRE2_JIT_COMPLETE # For full matching. + cdef unsigned int PCRE2_JIT_PARTIAL_SOFT + cdef unsigned int PCRE2_JIT_PARTIAL_HARD + cdef unsigned int PCRE2_JIT_INVALID_UTF + + # These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and + # pcre2_substitute(). Some are allowed only for one of the functions, and + # in these cases it is noted below. Note that PCRE2_ANCHORED, + # PCRE2_ENDANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these + # functions (though pcre2_jit_match() ignores the latter since it bypasses + # all sanity checks). + cdef unsigned int PCRE2_NOTBOL + cdef unsigned int PCRE2_NOTEOL + cdef unsigned int PCRE2_NOTEMPTY # ) These two must be kept + cdef unsigned int PCRE2_NOTEMPTY_ATSTART # ) adjacent to each other. 
+ cdef unsigned int PCRE2_PARTIAL_SOFT + cdef unsigned int PCRE2_PARTIAL_HARD + cdef unsigned int PCRE2_DFA_RESTART # pcre2_dfa_match() only + cdef unsigned int PCRE2_DFA_SHORTEST # pcre2_dfa_match() only + cdef unsigned int PCRE2_SUBSTITUTE_GLOBAL # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_EXTENDED # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_UNSET_EMPTY # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_UNKNOWN_UNSET # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_OVERFLOW_LENGTH # pcre2_substitute() only + cdef unsigned int PCRE2_NO_JIT # Not for pcre2_dfa_match() + cdef unsigned int PCRE2_COPY_MATCHED_SUBJECT + cdef unsigned int PCRE2_SUBSTITUTE_LITERAL # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_MATCHED # pcre2_substitute() only + cdef unsigned int PCRE2_SUBSTITUTE_REPLACEMENT_ONLY # pcre2_substitute() only + + # Options for pcre2_pattern_convert(). + cdef unsigned int PCRE2_CONVERT_UTF + cdef unsigned int PCRE2_CONVERT_NO_UTF_CHECK + cdef unsigned int PCRE2_CONVERT_POSIX_BASIC + cdef unsigned int PCRE2_CONVERT_POSIX_EXTENDED + cdef unsigned int PCRE2_CONVERT_GLOB + cdef unsigned int PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR + cdef unsigned int PCRE2_CONVERT_GLOB_NO_STARSTAR + + # Newline and \R settings, for use in compile contexts. The newline values + # must be kept in step with values set in config.h and both sets must all + # be greater than zero. + cdef int PCRE2_NEWLINE_CR + cdef int PCRE2_NEWLINE_LF + cdef int PCRE2_NEWLINE_CRLF + cdef int PCRE2_NEWLINE_ANY + cdef int PCRE2_NEWLINE_ANYCRLF + cdef int PCRE2_NEWLINE_NUL + + cdef int PCRE2_BSR_UNICODE + cdef int PCRE2_BSR_ANYCRLF + + # Error codes for pcre2_compile(). Some of these are also used by + # pcre2_pattern_convert(). 
+ cdef int PCRE2_ERROR_END_BACKSLASH + cdef int PCRE2_ERROR_END_BACKSLASH_C + cdef int PCRE2_ERROR_UNKNOWN_ESCAPE + cdef int PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER + cdef int PCRE2_ERROR_QUANTIFIER_TOO_BIG + cdef int PCRE2_ERROR_MISSING_SQUARE_BRACKET + cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS + cdef int PCRE2_ERROR_CLASS_RANGE_ORDER + cdef int PCRE2_ERROR_QUANTIFIER_INVALID + cdef int PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT + cdef int PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY + cdef int PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS + cdef int PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING + cdef int PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS + cdef int PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE + cdef int PCRE2_ERROR_NULL_PATTERN + cdef int PCRE2_ERROR_BAD_OPTIONS + cdef int PCRE2_ERROR_MISSING_COMMENT_CLOSING + cdef int PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP + cdef int PCRE2_ERROR_PATTERN_TOO_LARGE + cdef int PCRE2_ERROR_HEAP_FAILED + cdef int PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS + cdef int PCRE2_ERROR_INTERNAL_CODE_OVERFLOW + cdef int PCRE2_ERROR_MISSING_CONDITION_CLOSING + cdef int PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH + cdef int PCRE2_ERROR_ZERO_RELATIVE_REFERENCE + cdef int PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES + cdef int PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED + cdef int PCRE2_ERROR_BAD_RELATIVE_REFERENCE + cdef int PCRE2_ERROR_UNKNOWN_POSIX_CLASS + cdef int PCRE2_ERROR_INTERNAL_STUDY_ERROR + cdef int PCRE2_ERROR_UNICODE_NOT_SUPPORTED + cdef int PCRE2_ERROR_PARENTHESES_STACK_CHECK + cdef int PCRE2_ERROR_CODE_POINT_TOO_BIG + cdef int PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED + cdef int PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C + cdef int PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE + cdef int PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG + cdef int PCRE2_ERROR_MISSING_CALLOUT_CLOSING + cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_VERB + cdef int PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P + cdef int PCRE2_ERROR_MISSING_NAME_TERMINATOR + cdef int PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME + cdef int 
PCRE2_ERROR_INVALID_SUBPATTERN_NAME + cdef int PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE + cdef int PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY + cdef int PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY + cdef int PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG + cdef int PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS + cdef int PCRE2_ERROR_CLASS_INVALID_RANGE + cdef int PCRE2_ERROR_OCTAL_BYTE_TOO_BIG + cdef int PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE + cdef int PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN + cdef int PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES + cdef int PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE + cdef int PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE + cdef int PCRE2_ERROR_BACKSLASH_G_SYNTAX + cdef int PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING + # Error 159 is obsolete and should now never occur + cdef int PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED + cdef int PCRE2_ERROR_VERB_UNKNOWN + cdef int PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG + cdef int PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED + cdef int PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW + cdef int PCRE2_ERROR_INVALID_OCTAL + cdef int PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH + cdef int PCRE2_ERROR_MARK_MISSING_ARGUMENT + cdef int PCRE2_ERROR_INVALID_HEXADECIMAL + cdef int PCRE2_ERROR_BACKSLASH_C_SYNTAX + cdef int PCRE2_ERROR_BACKSLASH_K_SYNTAX + cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS + cdef int PCRE2_ERROR_BACKSLASH_N_IN_CLASS + cdef int PCRE2_ERROR_CALLOUT_STRING_TOO_LONG + cdef int PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT + cdef int PCRE2_ERROR_UTF_IS_DISABLED + cdef int PCRE2_ERROR_UCP_IS_DISABLED + cdef int PCRE2_ERROR_VERB_NAME_TOO_LONG + cdef int PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG + cdef int PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS + cdef int PCRE2_ERROR_VERSION_CONDITION_SYNTAX + cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS + cdef int PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER + cdef int PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER + cdef int PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED + cdef int PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP + cdef int 
PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED + cdef int PCRE2_ERROR_PATTERN_TOO_COMPLICATED + cdef int PCRE2_ERROR_LOOKBEHIND_TOO_LONG + cdef int PCRE2_ERROR_PATTERN_STRING_TOO_LONG + cdef int PCRE2_ERROR_INTERNAL_BAD_CODE + cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP + cdef int PCRE2_ERROR_NO_SURROGATES_IN_UTF16 + cdef int PCRE2_ERROR_BAD_LITERAL_OPTIONS + cdef int PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE + cdef int PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS + cdef int PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN + cdef int PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE + cdef int PCRE2_ERROR_TOO_MANY_CAPTURES + cdef int PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED + cdef int PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND + + # "Expected" matching error codes: no match and partial match. + cdef int PCRE2_ERROR_NOMATCH + cdef int PCRE2_ERROR_PARTIAL + + # Error codes for UTF-8 validity checks. + cdef int PCRE2_ERROR_UTF8_ERR1 + cdef int PCRE2_ERROR_UTF8_ERR2 + cdef int PCRE2_ERROR_UTF8_ERR3 + cdef int PCRE2_ERROR_UTF8_ERR4 + cdef int PCRE2_ERROR_UTF8_ERR5 + cdef int PCRE2_ERROR_UTF8_ERR6 + cdef int PCRE2_ERROR_UTF8_ERR7 + cdef int PCRE2_ERROR_UTF8_ERR8 + cdef int PCRE2_ERROR_UTF8_ERR9 + cdef int PCRE2_ERROR_UTF8_ERR10 + cdef int PCRE2_ERROR_UTF8_ERR11 + cdef int PCRE2_ERROR_UTF8_ERR12 + cdef int PCRE2_ERROR_UTF8_ERR13 + cdef int PCRE2_ERROR_UTF8_ERR14 + cdef int PCRE2_ERROR_UTF8_ERR15 + cdef int PCRE2_ERROR_UTF8_ERR16 + cdef int PCRE2_ERROR_UTF8_ERR17 + cdef int PCRE2_ERROR_UTF8_ERR18 + cdef int PCRE2_ERROR_UTF8_ERR19 + cdef int PCRE2_ERROR_UTF8_ERR20 + cdef int PCRE2_ERROR_UTF8_ERR21 + + # Error codes for UTF-16 validity checks. + cdef int PCRE2_ERROR_UTF16_ERR1 + cdef int PCRE2_ERROR_UTF16_ERR2 + cdef int PCRE2_ERROR_UTF16_ERR3 + + # Error codes for UTF-32 validity checks. + cdef int PCRE2_ERROR_UTF32_ERR1 + cdef int PCRE2_ERROR_UTF32_ERR2 + + # Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction + # functions, context functions, and serializing functions. 
They are in + # numerical order. Originally they were in alphabetical order too, but now + # that PCRE2 is released, the numbers must not be changed. + cdef int PCRE2_ERROR_BADDATA + cdef int PCRE2_ERROR_MIXEDTABLES # Name was changed. + cdef int PCRE2_ERROR_BADMAGIC + cdef int PCRE2_ERROR_BADMODE + cdef int PCRE2_ERROR_BADOFFSET + cdef int PCRE2_ERROR_BADOPTION + cdef int PCRE2_ERROR_BADREPLACEMENT + cdef int PCRE2_ERROR_BADUTFOFFSET + cdef int PCRE2_ERROR_CALLOUT # Never used by PCRE2 itself. + cdef int PCRE2_ERROR_DFA_BADRESTART + cdef int PCRE2_ERROR_DFA_RECURSE + cdef int PCRE2_ERROR_DFA_UCOND + cdef int PCRE2_ERROR_DFA_UFUNC + cdef int PCRE2_ERROR_DFA_UITEM + cdef int PCRE2_ERROR_DFA_WSSIZE + cdef int PCRE2_ERROR_INTERNAL + cdef int PCRE2_ERROR_JIT_BADOPTION + cdef int PCRE2_ERROR_JIT_STACKLIMIT + cdef int PCRE2_ERROR_MATCHLIMIT + cdef int PCRE2_ERROR_NOMEMORY + cdef int PCRE2_ERROR_NOSUBSTRING + cdef int PCRE2_ERROR_NOUNIQUESUBSTRING + cdef int PCRE2_ERROR_NULL + cdef int PCRE2_ERROR_RECURSELOOP + cdef int PCRE2_ERROR_DEPTHLIMIT + cdef int PCRE2_ERROR_RECURSIONLIMIT # Obsolete synonym. + cdef int PCRE2_ERROR_UNAVAILABLE + cdef int PCRE2_ERROR_UNSET + cdef int PCRE2_ERROR_BADOFFSETLIMIT + cdef int PCRE2_ERROR_BADREPESCAPE + cdef int PCRE2_ERROR_REPMISSINGBRACE + cdef int PCRE2_ERROR_BADSUBSTITUTION + cdef int PCRE2_ERROR_BADSUBSPATTERN + cdef int PCRE2_ERROR_TOOMANYREPLACE + cdef int PCRE2_ERROR_BADSERIALIZEDDATA + cdef int PCRE2_ERROR_HEAPLIMIT + cdef int PCRE2_ERROR_CONVERT_SYNTAX + cdef int PCRE2_ERROR_INTERNAL_DUPMATCH + cdef int PCRE2_ERROR_DFA_UINVALID_UTF + + # Request types for pcre2_pattern_info(). 
+ cdef int PCRE2_INFO_ALLOPTIONS + cdef int PCRE2_INFO_ARGOPTIONS + cdef int PCRE2_INFO_BACKREFMAX + cdef int PCRE2_INFO_BSR + cdef int PCRE2_INFO_CAPTURECOUNT + cdef int PCRE2_INFO_FIRSTCODEUNIT + cdef int PCRE2_INFO_FIRSTCODETYPE + cdef int PCRE2_INFO_FIRSTBITMAP + cdef int PCRE2_INFO_HASCRORLF + cdef int PCRE2_INFO_JCHANGED + cdef int PCRE2_INFO_JITSIZE + cdef int PCRE2_INFO_LASTCODEUNIT + cdef int PCRE2_INFO_LASTCODETYPE + cdef int PCRE2_INFO_MATCHEMPTY + cdef int PCRE2_INFO_MATCHLIMIT + cdef int PCRE2_INFO_MAXLOOKBEHIND + cdef int PCRE2_INFO_MINLENGTH + cdef int PCRE2_INFO_NAMECOUNT + cdef int PCRE2_INFO_NAMEENTRYSIZE + cdef int PCRE2_INFO_NAMETABLE + cdef int PCRE2_INFO_NEWLINE + cdef int PCRE2_INFO_DEPTHLIMIT + cdef int PCRE2_INFO_RECURSIONLIMIT # Obsolete synonym + cdef int PCRE2_INFO_SIZE + cdef int PCRE2_INFO_HASBACKSLASHC + cdef int PCRE2_INFO_FRAMESIZE + cdef int PCRE2_INFO_HEAPLIMIT + cdef int PCRE2_INFO_EXTRAOPTIONS + + # Request types for pcre2_config(). + cdef int PCRE2_CONFIG_BSR + cdef int PCRE2_CONFIG_JIT + cdef int PCRE2_CONFIG_JITTARGET + cdef int PCRE2_CONFIG_LINKSIZE + cdef int PCRE2_CONFIG_MATCHLIMIT + cdef int PCRE2_CONFIG_NEWLINE + cdef int PCRE2_CONFIG_PARENSLIMIT + cdef int PCRE2_CONFIG_DEPTHLIMIT + cdef int PCRE2_CONFIG_RECURSIONLIMIT # Obsolete synonym + cdef int PCRE2_CONFIG_STACKRECURSE # Obsolete + cdef int PCRE2_CONFIG_UNICODE + cdef int PCRE2_CONFIG_UNICODE_VERSION + cdef int PCRE2_CONFIG_VERSION + cdef int PCRE2_CONFIG_HEAPLIMIT + cdef int PCRE2_CONFIG_NEVER_BACKSLASH_C + cdef int PCRE2_CONFIG_COMPILED_WIDTHS + cdef int PCRE2_CONFIG_TABLES_LENGTH + + + # Opaque handles for PCRE2 defined structs. 
+ ctypedef struct pcre2_code_t "pcre2_code": + pass + ctypedef struct pcre2_match_data_t "pcre2_match_data": + pass + ctypedef struct pcre2_general_context_t "pcre2_general_context": + pass + ctypedef struct pcre2_compile_context_t "pcre2_compile_context": + pass + ctypedef struct pcre2_match_context_t "pcre2_match_context": + pass + + # Basic string definition. Note that this assumes PCRE2 is compiled to + # support 8-bit strings. + ctypedef const uint8_t *pcre2_sptr_t "PCRE2_SPTR" + + # Error handling functions. + int pcre2_get_error_message( + int errorcode, + uint8_t *buffer, + size_t bufflen + ) + + # Pattern compilation functions. + pcre2_code_t * pcre2_compile( + pcre2_sptr_t pattern, + size_t length, + uint32_t options, + int *errorcode, + size_t *erroroffset, + pcre2_compile_context_t *ccontext + ) + + int pcre2_jit_compile( + pcre2_code_t *code, + uint32_t options + ) + + + void pcre2_code_free(pcre2_code_t *code) + + # Information on compiled pattern. + int pcre2_pattern_info( + const pcre2_code_t *code, + uint32_t what, + void *where + ) + + int pcre2_substring_number_from_name( + const pcre2_code_t *code, + pcre2_sptr_t name + ) + + # Matching and match data functions. 
+ pcre2_match_data_t * pcre2_match_data_create( + uint32_t ovecsize, + pcre2_general_context_t *gcontext + ) + + pcre2_match_data_t * pcre2_match_data_create_from_pattern( + const pcre2_code_t *code, + pcre2_general_context_t *gcontext + ) + + int pcre2_match( + const pcre2_code_t *code, + pcre2_sptr_t subject, + size_t length, + size_t startoffset, + uint32_t options, + pcre2_match_data_t *match_data, + pcre2_match_context_t *mcontext + ) + int pcre2_jit_match( + const pcre2_code_t *code, + pcre2_sptr_t subject, + size_t length, + size_t startoffset, + uint32_t options, + pcre2_match_data_t *match_data, + pcre2_match_context_t *mcontext + ) + + void pcre2_match_data_free(pcre2_match_data_t *match_data) + + uint32_t pcre2_get_ovector_count(pcre2_match_data_t *match_data) + + size_t *pcre2_get_ovector_pointer(pcre2_match_data_t *match_data) + + int pcre2_substring_nametable_scan( + const pcre2_code_t *code, + pcre2_sptr_t name, + pcre2_sptr_t *first, + pcre2_sptr_t *last + ) + + # String extraction from match data blocks. + int pcre2_substring_length_byname( + pcre2_match_data_t *match_data, + pcre2_sptr_t name, + size_t *bufflen + ) + + int pcre2_substring_get_byname( + pcre2_match_data_t *match_data, + pcre2_sptr_t name, + uint8_t **bufferptr, + size_t *bufflen + ) + + int pcre2_substring_length_bynumber( + pcre2_match_data_t *match_data, + uint32_t number, + size_t *bufflen + ) + + int pcre2_substring_get_bynumber( + pcre2_match_data_t *match_data, + uint32_t number, + uint8_t **bufferptr, + size_t *bufflen + ) + + # Substitution. + int pcre2_substitute( + const pcre2_code_t *code, + pcre2_sptr_t subject, + size_t length, + size_t startoffset, + uint32_t options, + pcre2_match_data_t *match_data, + pcre2_match_context_t *mcontext, + pcre2_sptr_t replacement, + size_t rlength, + uint8_t *outputbuffer, + size_t *outlengthptr + ) + + # Serialization. 
+ int32_t pcre2_serialize_decode( + pcre2_code_t **codes, + int32_t number_of_codes, + const uint8_t *code_bytes, + pcre2_general_context_t *gcontext + ) + int32_t pcre2_serialize_encode( + pcre2_code_t **codes, + int32_t number_of_codes, + uint8_t **serialized_bytes, + size_t *serialized_size, + pcre2_general_context_t *gcontext + ) + void pcre2_serialize_free(uint8_t *bytes) diff --git a/tests/test_groups.py b/tests/test_groups.py new file mode 100644 index 0000000..a7a8c28 --- /dev/null +++ b/tests/test_groups.py @@ -0,0 +1,14 @@ +import pytest +import pcre2 + + +def test_match_groups(): + assert pcre2.match("a", "a").groups() == () + assert pcre2.match("(a)", "a").groups() == ("a",) + + assert pcre2.match(b"a", b"a").groups() == () + assert pcre2.match(b"(a)", b"a").groups() == (b"a",) + + for a in ("\xe0", "\u0430", "\U0001d49c"): + assert pcre2.match(a, a).groups() == () + assert pcre2.match("(%s)" % a, a).groups() == (a,) diff --git a/tests/test_match.py b/tests/test_match.py new file mode 100644 index 0000000..b083776 --- /dev/null +++ b/tests/test_match.py @@ -0,0 +1,58 @@ +import pytest +import pcre2 +import re + + +# All tests should match successfully. 
+test_data_match_bounds = [ + (b".*", "aba•ba••ba•••b".encode(), 0, 0, None, 0, 0, 26), + (".*", "aba•ba••ba•••b", 0, 0, None, 0, 0, 14), + (r"\w+", "b•", 0, 0, None, 0, 0, 1), + (r"\w+", "b•", 0, None, None, 0, 0, 1), + (r"\w+", "•b", 0, 1, None, 0, 1, 2), + (r"\w+", "•bc", 0, 2, None, 0, 2, 3), + (r"\w+", "•bc", 0, 1, 2, 0, 1, 2), +] + + +@pytest.mark.parametrize("pattern,subject,flags,pos,endpos,group,start,end", test_data_match_bounds) +def test_match_bounds(pattern, subject, flags, pos, endpos, group, start, end): + p = pcre2.compile(pattern, flags=flags) + kwargs = {} + if endpos is not None: + kwargs["endpos"] = endpos + if pos is not None: + kwargs["pos"] = pos + m = p.match(subject, **kwargs) + assert (m.start(group), m.end(group)) == (start, end) + if endpos is not None: + assert m.endpos == endpos + if pos is not None: + assert m.pos == pos + + +test_data_match_substring = [ + (b".*", "aba•ba••ba•••b".encode(), 0, 0, "aba•ba••ba•••b".encode()), + (".*", "aba•ba••ba•••b", 0, 0, "aba•ba••ba•••b"), +] + + +@pytest.mark.parametrize("pattern,subject,flags,pos,substring", test_data_match_substring) +def test_match_substring(pattern, subject, flags, pos, substring): + p = pcre2.compile(pattern, flags=flags) + m = p.match(subject, pos=pos) + assert m[0] == substring + + +test_data_match_expand = [ + (b"[abc]+", b"$0", b"dabacbaccbacccb", 0, 0, b"abacbaccbacccb"), + ("[abc]+", "$0", "dabacbaccbacccb", 0, 0, "abacbaccbacccb"), + ("[abc]+", "$0", "dabacbaccbacccb", 0, 10, "acccb"), +] + + +@pytest.mark.parametrize("pattern,replacement,subject,flags,pos,result", test_data_match_expand) +def test_match_expand(pattern, replacement, subject, flags, pos, result): + p = pcre2.compile(pattern, flags=flags) + m = p.search(subject, pos=pos) + assert m.expand(replacement) == result diff --git a/tests/test_pattern.py b/tests/test_pattern.py new file mode 100644 index 0000000..c3c67b2 --- /dev/null +++ b/tests/test_pattern.py @@ -0,0 +1,237 @@ +import pytest +import pcre2 
+from pcre2._cy import LibraryError + + +test_data_pattern_compile_success = [ + (b"a+b+c*d*", 0, "SUCCESS"), + (b"(?a+b+)c*d*", 0, "SUCCESS"), + (b"(?a+b+))c*d*", 0, "COMPILE_ERROR"), + ("å+∫+ç*∂*".encode(), 0, "SUCCESS"), + ("a+b+c*d*", 0, "SUCCESS"), + ("(?a+b+)c*d*", 0, "SUCCESS"), + ("(?a+b+))c*d*", 0, "COMPILE_ERROR"), + ("(?<a+b+)c*d*", 0, "COMPILE_ERROR"), + ("(?a+b+)c*d*(?a+b+)", 0, "COMPILE_ERROR"), + ("å+∫+ç*∂*", 0, "SUCCESS"), + ("(?<ƒøø>a+b+)c*d*", 0, "SUCCESS"), +] + + +@pytest.mark.parametrize("pattern,flags,return_code", test_data_pattern_compile_success) +def test_pattern_compile_success(pattern, flags, return_code): + try: + p = pcre2.compile(pattern, flags=flags, jit=False) + rc = "SUCCESS" + assert not p.jit + except pcre2.PatternError: + rc = "COMPILE_ERROR" + except pcre2.LibraryError: + rc = "LIB_ERROR" + assert rc == return_code + + +@pytest.mark.parametrize("pattern,flags,return_code", test_data_pattern_compile_success) +def test_pattern_jit_compile_success(pattern, flags, return_code): + try: + p = pcre2.compile(pattern, flags=flags, jit=True) + rc = "SUCCESS" + assert p.jit + except pcre2.PatternError: + rc = "COMPILE_ERROR" + except pcre2.LibraryError: + rc = "LIB_ERROR" + assert rc == return_code + + +test_data_pattern_groupindex = [ + (b"(?a+b+)c*d*", 0, {"foo": 1}), + ("(?a+b+)c*d*", 0, {"foo": 1}), + ("(?<ƒøø>a+b+)c*d*", 0, {"ƒøø": 1}), + ("(?a+b+)c*d*(?a+b+)", 0, {"foo": 1, "bar": 2}), + ("(?a+b+)c*(.+)d*(?a+b+)", 0, {"foo": 1, "bar": 3}), +] + + +@pytest.mark.parametrize("pattern,flags,groupindex", test_data_pattern_groupindex) +def test_pattern_groupindex(pattern, flags, groupindex): + p = pcre2.compile(pattern, flags=flags) + assert p.groupindex == groupindex + + +test_data_pattern_match_success = [ + (b".*", b"abacbaccbacccb", 0, 0, "SUCCESS"), + (".*", "abacbaccbacccb", 0, 0, "SUCCESS"), + ("ac{3,}b", "abacbaccbacccb", 0, 0, "SUCCESS"), + ("a•{3,}b", "aba•ba••ba•••b", 0, 0, "SUCCESS"), + ("ab", "abacbaccbacccb", 0, 2, 
"UNMATCHED"), + ("((((((((((((((()))))))))))))))", "", 0, 0, "SUCCESS"), +] + + +@pytest.mark.parametrize("pattern,subject,flags,pos,return_code", test_data_pattern_match_success) +def test_pattern_match_success(pattern, subject, flags, pos, return_code): + p = pcre2.compile(pattern, flags=flags) + try: + m = p.search(subject, pos=pos) + rc = "SUCCESS" if m else "UNMATCHED" + except LibraryError as e: + rc = "LIB_ERROR" + assert rc == return_code + + +test_data_pattern_scan_length = [ + (b".+", b"abacbaccbacccb", 0, 1), + (b".*", b"abacbaccbacccb", 0, 2), + (".+", "abacbaccbacccb", 0, 1), + (".*", "abacbaccbacccb", 0, 2), + ("[abc]*", "dabacbaccbacccb", 0, 3), + ("ac{2,}b", "abacbaccbacccb", 0, 2), + ("a•{2,}b", "aba•ba••ba•••b", 0, 2), + ("a•*b", "aba•ba••ba•••b", 0, 4), + ("ab", "abacbaccbacccb", 2, 0), +] + + +@pytest.mark.parametrize("pattern,subject,pos,iter_length", test_data_pattern_scan_length) +def test_pattern_scan_length(pattern, subject, pos, iter_length): + p = pcre2.compile(pattern) + s = p.finditer(subject, pos=pos) + assert len(list(iter(s))) == iter_length + + +test_pattern_substitute = [ + (b"[abc]*", b"", b"dabacbaccbacccb", 1, b"dabacbaccbacccb"), + ("[abc]*", "", "dabacbaccbacccb", 1, "dabacbaccbacccb"), + ("[abc]*", "", "dabacbaccbacccb", 0, "d"), + ("a(•{2,})b", "a•b", "aba•ba••ba•••b", 0, "aba•ba•ba•b"), + ("a(•{2,})b", "a$1b", "aba•ba••ba•••b", 0, "aba•ba••ba•••b"), + ("a(•{2,})b", lambda m: m[0] + m[0], "aba•ba••ba•••b", 0, "aba•ba••ba••ba•••ba•••b"), + ("a(•{2,})b", lambda m: m[1] + m[1], "aba•ba••ba•••b", 0, "aba•b••••••••••"), +] + + +@pytest.mark.parametrize("pattern,replacement,subject,count,result", test_pattern_substitute) +def test_pattern_substitute(pattern, replacement, subject, count, result): + p = pcre2.compile(pattern) + assert p.sub(replacement, subject, count) == result + + +def test_pattern_findall(): + p = pcre2.compile(r"(\w+)=(\d+)") + assert p.findall("set width=20 and height=10") == [("width", "20"), ("height", "10")] 
+ s = bytes(range(128)).decode() + p2 = pcre2.compile(r"[0-9--1]") + assert p2.findall(s) == list("-./0123456789") + p3 = pcre2.compile(r"[%--1]") + assert p3.findall(s) == list("%&'()*+,-1") + p4 = pcre2.compile(r"[%--]") + assert p4.findall(s) == list("%&'()*+,-") + p5 = pcre2.compile(r"[0-9&&1]") + assert p5.findall(s) == list("&0123456789") + p6 = pcre2.compile(r"[\d&&1]") + assert p6.findall(s) == list("&0123456789") + p7 = pcre2.compile(r"[0-9||a]") + assert p7.findall(s) == list("0123456789a|") + p8 = pcre2.compile(r"[\d||a]") + assert p8.findall(s) == list("0123456789a|") + p9 = pcre2.compile(r"[0-9~~1]") + assert p9.findall(s) == list("0123456789~") + p10 = pcre2.compile(r"[\d~~1]") + assert p10.findall(s) == list("0123456789~") + p11 = pcre2.compile(r"[[0-9]|]") + assert p11.findall(s) == list("0123456789[]") + + for reps in "*", "+", "?", "{1}": + for mod in "", "?": + pattern = "." + reps + mod + "yz" + assert pcre2.compile(pattern, pcre2.S).findall("xyz") == ["xyz"], pattern + pattern = pattern.encode() + assert pcre2.compile(pattern, pcre2.S).findall(b"xyz") == [b"xyz"], pattern + + +def test_pattern_jit_findall(): + assert pcre2.findall(r"(\w+)=(\d+)", "set width=20 and height=10") == [ + ("width", "20"), + ("height", "10"), + ] + assert pcre2.findall(":+", "abc") == [] + assert pcre2.findall(":+", "a:b::c:::d") == [":", "::", ":::"] + assert pcre2.findall("(:+)", "a:b::c:::d") == [":", "::", ":::"] + + for x in ("\xe0", "\u0430", "\U0001d49c"): + xx = x * 2 + xxx = x * 3 + string = "a%sb%sc%sd" % (x, xx, xxx) + assert pcre2.findall("%s+" % x, string) == [x, xx, xxx] + assert pcre2.findall("(%s+)" % x, string) == [x, xx, xxx] + + assert len(pcre2.findall(r"\b", "a")) == 2 + assert len(pcre2.findall(r"\B", "a")) == 0 + assert len(pcre2.findall(r"\b", " ")) == 0 + assert len(pcre2.findall(r"\b", " ")) == 0 + assert len(pcre2.findall(r"\B", " ")) == 2 + + s = bytes(range(128)).decode() + assert pcre2.findall(r"[--1]", s) == list("-./01") + assert 
pcre2.findall(r"[&&1]", s) == list("&1") + assert pcre2.findall(r"[||1]", s) == list("1|") + assert pcre2.findall(r"[~~1]", s) == list("1~") + + assert pcre2.findall(r"(?i)(a)\1", "aa \u0100") == ["a"] + + assert pcre2.findall(r"a++", "aab") == ["aa"] + assert pcre2.findall(r"a*+", "aab") == ["aa", "", ""] + assert pcre2.findall(r"a?+", "aab") == ["a", "a", "", ""] + assert pcre2.findall(r"a{1,3}+", "aab") == ["aa"] + + assert pcre2.findall(r"(?:ab)++", "ababc") == ["abab"] + assert pcre2.findall(r"(?:ab)*+", "ababc") == ["abab", "", ""] + assert pcre2.findall(r"(?:ab)?+", "ababc") == ["ab", "ab", "", ""] + assert pcre2.findall(r"(?:ab){1,3}+", "ababc") == ["abab"] + + assert pcre2.findall(r"(?>a+)", "aab") == ["aa"] + assert pcre2.findall(r"(?>a*)", "aab") == ["aa", "", ""] + assert pcre2.findall(r"(?>a?)", "aab") == ["a", "a", "", ""] + assert pcre2.findall(r"(?>a{1,3})", "aab") == ["aa"] + + assert pcre2.findall(r"(?>(?:ab)+)", "ababc") == ["abab"] + assert pcre2.findall(r"(?>(?:ab)*)", "ababc") == ["abab", "", ""] + assert pcre2.findall(r"(?>(?:ab)?)", "ababc") == ["ab", "ab", "", ""] + assert pcre2.findall(r"(?>(?:ab){1,3})", "ababc") == ["abab"] + + import re + + b = "y\u2620y\u2620y".encode("utf-8") + assert len(pcre2.findall(re.escape("\u2620".encode("utf-8")), b)) == 2 + + +def test_pattern_split(): + pattern = "[\u002e\u3002\uff0e\uff61]" + assert pcre2.compile(pattern).split("a.b.c") == ["a", "b", "c"] + + +def test_pattern_jit_split(): + assert pcre2.split(":", ":a:b::c") == ["", "a", "b", "", "c"] + assert pcre2.split(":+", ":a:b::c") == ["", "a", "b", "c"] + assert pcre2.split("(:+)", ":a:b::c") == ["", ":", "a", ":", "b", "::", "c"] + + assert pcre2.split(b":", b":a:b::c") == [b"", b"a", b"b", b"", b"c"] + assert pcre2.split(b":+", b":a:b::c") == [b"", b"a", b"b", b"c"] + assert pcre2.split(b"(:+)", b":a:b::c") == [b"", b":", b"a", b":", b"b", b"::", b"c"] + + for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", "\U0001d49c\U0001d49e\U0001d4b5"): + 
string = ":%s:%s::%s" % (a, b, c) + assert pcre2.split(":", string) == ["", a, b, "", c] + assert pcre2.split(":+", string) == ["", a, b, c] + assert pcre2.split("(:+)", string) == ["", ":", a, ":", b, "::", c] + + assert pcre2.split("(?::+)", ":a:b::c") == ["", "a", "b", "c"] + assert pcre2.split("([b:]+)", ":a:b::c") == ["", ":", "a", ":b::", "c"] + assert pcre2.split("(?:b)|(?::+)", ":a:b::c") == ["", "a", "", "", "c"] + + assert pcre2.split(":", ":a:b::c", 2) == ["", "a", "b::c"] + assert pcre2.split(":", ":a:b::c", maxsplit=2) == ["", "a", "b::c"] + assert pcre2.split(":", "a:b:c:d", maxsplit=2) == ["a", "b", "c:d"] + assert pcre2.split("(:)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"] + assert pcre2.split("(:+)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"] diff --git a/tests/test_re_compatibility.py b/tests/test_re_compatibility.py new file mode 100644 index 0000000..8909512 --- /dev/null +++ b/tests/test_re_compatibility.py @@ -0,0 +1,1918 @@ +import pcre2 as re +import string +import multiprocessing +from weakref import proxy +import pytest + +from tests.utils import ( + assert_raises, + assert_typed_equal, + check_pattern_error, + check_template_error, +) + +# This file is a modified version of the tests from CPython's regex test suite, meant to provide +# coverage for the built-in module's behavior. However, the intention is not to cover 100% of +# Python tests. Some functionality will remain different, such as the equality of compiled +# patterns. The goal is to cover enough of the API to make using PCRE2 feel like using the built-in +# module. 
For the tests included, you can find original versions in the link below (Python bug IDs +# are preserved for searching): +# https://github.com/python/cpython/blob/3.14/Lib/test/test_re.py + + +class S(str): + def __getitem__(self, index): + return S(super().__getitem__(index)) + + +class B(bytes): + def __getitem__(self, index): + return B(super().__getitem__(index)) + + +def test_weakref(): + s = "QabbbcR" + x = re.compile("ab+c") + y = proxy(x) + assert x.findall("QabbbcR") == y.findall("QabbbcR") + + +def test_search_star_plus(): + assert re.search("x*", "axx").span(0) == (0, 0) + assert re.search("x*", "axx").span() == (0, 0) + assert re.search("x+", "axx").span(0) == (1, 3) + assert re.search("x+", "axx").span() == (1, 3) + assert re.search("x", "aaa") is None + assert re.match("a*", "xxx").span(0) == (0, 0) + assert re.match("a*", "xxx").span() == (0, 0) + assert re.match("x*", "xxxa").span(0) == (0, 3) + assert re.match("x*", "xxxa").span() == (0, 3) + assert re.match("a+", "xxx") is None + + +def test_branching(): + """Test Branching + Test expressions using the OR ('|') operator.""" + assert re.match("(ab|ba)", "ab").span() == (0, 2) + assert re.match("(ab|ba)", "ba").span() == (0, 2) + assert re.match("(abc|bac|ca|cb)", "abc").span() == (0, 3) + assert re.match("(abc|bac|ca|cb)", "bac").span() == (0, 3) + assert re.match("(abc|bac|ca|cb)", "ca").span() == (0, 2) + assert re.match("(abc|bac|ca|cb)", "cb").span() == (0, 2) + assert re.match("((a)|(b)|(c))", "a").span() == (0, 1) + assert re.match("((a)|(b)|(c))", "b").span() == (0, 1) + assert re.match("((a)|(b)|(c))", "c").span() == (0, 1) + + +def bump_num(matchobj): + int_value = int(matchobj.group(0)) + return str(int_value + 1) + + +def test_basic_re_sub(): + assert_typed_equal(re.sub("y", "a", "xyz"), "xaz") + assert_typed_equal(re.sub("y", S("a"), S("xyz")), "xaz") + assert_typed_equal(re.sub(b"y", b"a", b"xyz"), b"xaz") + assert_typed_equal(re.sub(b"y", B(b"a"), B(b"xyz")), b"xaz") + 
assert_typed_equal(re.sub(b"y", bytearray(b"a"), bytearray(b"xyz")), b"xaz") + assert_typed_equal(re.sub(b"y", memoryview(b"a"), memoryview(b"xyz")), b"xaz") + + for y in ("\xe0", "\u0430", "\U0001d49c"): + assert re.sub(y, "a", "x%sz" % y) == "xaz" + + assert re.sub("(?i)b+", "x", "bbbb BBBB") == "x x" + assert re.sub(r"\d+", bump_num, "08.2 -2 23x99y") == "9.3 -3 24x100y" + + assert re.sub(r"\d+", bump_num, "08.2 -2 23x99y", count=3) == "9.3 -3 23x99y" + + assert re.sub(".", lambda m: r"\n", "x") == "\\n" + assert re.sub(".", r"\n", "x") == "\n" + + s = r"\g<1>\g<1>" + assert re.sub("(.)", s, "x") == "xx" + assert re.sub("(.)", s.replace("\\", r"\\"), "x") == s + assert re.sub("(.)", lambda m: s, "x") == s + + assert re.sub("(?Px)", r"\g\g", "xx") == "xxxx" + assert re.sub("(?Px)", r"\g\g<1>", "xx") == "xxxx" + assert re.sub("(?Px)", r"\g\g", "xx") == "xxxx" + assert re.sub("(?Px)", r"\g<1>\g<1>", "xx") == "xxxx" + assert re.sub("()x", r"\g<0>\g<0>", "xx") == "xxxx" + + assert re.sub("a", r"\t\n\v\r\f\a\b", "a") == "\t\n\v\r\f\a\b" + assert re.sub("a", "\t\n\v\r\f\a\b", "a") == "\t\n\v\r\f\a\b" + assert re.sub("a", "\t\n\v\r\f\a\b", "a") == ( + chr(9) + chr(10) + chr(11) + chr(13) + chr(12) + chr(7) + chr(8) + ) + + # Note that we removed the reserved characters in PCRE2 extended substitution syntax + for c in "cdhijkmopqswxyzABCDFGHIJKMNOPRSTVWXYZ": + with pytest.raises(re.LibraryError): + assert re.sub("a", "\\" + c, "a") == "\\" + c + + assert re.sub(r"^\s*", "X", "test") == "Xtest" + + +def test_bug_449964(): + # fails for group followed by other escape + assert re.sub(r"(?Px)", r"\g<1>\g<1>\b", "xx") == "xx\bxx\b" + + +def test_bug_449000(): + # Test for sub() on escaped characters + assert re.sub(r"\r\n", r"\n", "abc\r\ndef\r\n") == "abc\ndef\n" + assert re.sub("\r\n", r"\n", "abc\r\ndef\r\n") == "abc\ndef\n" + assert re.sub(r"\r\n", "\n", "abc\r\ndef\r\n") == "abc\ndef\n" + assert re.sub("\r\n", "\n", "abc\r\ndef\r\n") == "abc\ndef\n" + + +def 
test_bug_1661(): + # Verify that flags do not get silently ignored with compiled patterns + pattern = re.compile(".") + assert_raises(ValueError, re.match, pattern, "A", re.I) + assert_raises(ValueError, re.search, pattern, "A", re.I) + assert_raises(ValueError, re.findall, pattern, "A", re.I) + assert_raises(ValueError, re.compile, pattern, re.I) + + +def test_bug_3629(): + # A regex that triggered a bug in the sre-code validator + re.compile("(?P)(?(quote))") + + +def test_sub_template_numeric_escape(): + # bug 776311 and friends + assert re.sub("x", r"\0", "x") == "\0" + assert re.sub("x", r"\000", "x") == "\000" + assert re.sub("x", r"\001", "x") == "\001" + assert re.sub("x", r"\008", "x") == "\0" + "8" + assert re.sub("x", r"\009", "x") == "\0" + "9" + assert re.sub("x", r"\111", "x") == "\111" + assert re.sub("x", r"\117", "x") == "\117" + assert re.sub("x", r"\377", "x") == "\377" + + assert re.sub("x", r"\1111", "x") == "\1111" + assert re.sub("x", r"\1111", "x") == "\111" + "1" + + assert re.sub("x", r"\00", "x") == "\x00" + assert re.sub("x", r"\07", "x") == "\x07" + assert re.sub("x", r"\08", "x") == "\0" + "8" + assert re.sub("x", r"\09", "x") == "\0" + "9" + assert re.sub("x", r"\0a", "x") == "\0" + "a" + + # in python2.3 (etc), these loop endlessly in sre_parser.py + + assert re.sub("(((((((((((x)))))))))))", r"\11", "x") == "x" + assert re.sub("((((((((((y))))))))))(.)", r"\11a", "xyz") == "xza" + + # Modified for different parsing behavior in PCRE2 + assert re.sub("((((((((((y))))))))))(.)", r"\g<11>8", "xyz") == "xz8" + + +def test_qualified_re_sub(): + assert re.sub("a", "b", "aaaaa") == "bbbbb" + assert re.sub("a", "b", "aaaaa", count=1) == "baaaa" + + with pytest.raises(TypeError, match=r"sub\(\) got multiple values for argument 'count'"): + re.sub("a", "b", "aaaaa", 1, count=1) + with pytest.raises(TypeError, match=r"sub\(\) got multiple values for argument 'flags'"): + re.sub("a", "b", "aaaaa", 1, 0, flags=0) + with pytest.raises( + 
TypeError, match=r"sub\(\) takes from 3 to 6 positional arguments but 7 were given" + ): + re.sub("a", "b", "aaaaa", 1, 0, False, 0) + + +def test_bug_114660(): + assert re.sub(r"(\S)\s+(\S)", r"\1 \2", "hello there") == "hello there" + + +def test_symbolic_groups(): + re.compile(r"(?Px)(?P=a)(?(a)y)") + re.compile(r"(?Px)(?P=a1)(?(a1)y)") + re.compile(r"(?Px)\1(?(1)y)") + re.compile(b"(?Px)(?P=a1)(?(a1)y)") + # New valid identifiers in Python 3 + re.compile("(?P<µ>x)(?P=µ)(?(µ)y)") + re.compile("(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)") + # Support > 100 groups. + pat = "|".join("x(?P%x)y" % (i, i) for i in range(1, 200 + 1)) + pat = "(?:%s)(?(200)z|t)" % pat + assert re.match(pat, "xc8yz").span() == (0, 5) + + +def test_symbolic_groups_errors(): + # This test originally tested error messages, but we only test failure of compilation as + # messages are managed by PCRE2 + check_pattern_error(r"(?P)(?P)") + check_pattern_error(r"(?Pxy)") + check_pattern_error(r"(?P)(?P=a") + check_pattern_error(r"(?P=") + check_pattern_error(r"(?P=)aaaaaaaaaaaaaaa") + check_pattern_error(r"(?P=1)") + check_pattern_error(r"(?P=a)") + check_pattern_error(r"(?P=a1)") + check_pattern_error(r"(?P=a.)") + check_pattern_error(r"(?P<)") + check_pattern_error(r"(?P)") + check_pattern_error(r"(?P<1>)") + check_pattern_error(r"(?P)") + check_pattern_error(r"(?(") + check_pattern_error(r"(?())") + check_pattern_error(r"(?(a))") + check_pattern_error(r"(?(-1))") + check_pattern_error(r"(?(1a))") + check_pattern_error(r"(?(a.))") + check_pattern_error("(?P<©>x)") + check_pattern_error("(?P=©)") + check_pattern_error("(?(©)y)") + check_pattern_error(b"(?P<\xc2\xb5>x)") + check_pattern_error(b"(?P=\xc2\xb5)") + check_pattern_error(b"(?(\xc2\xb5)y)") + + +def test_symbolic_refs(): + assert re.sub("(?Px)|(?Py)", r"\g", "xx") == "" + assert re.sub("(?Px)|(?Py)", r"\2", "xx") == "" + assert re.sub(b"(?Px)", rb"\g", b"xx") == b"xx" + # New valid identifiers in Python 3 + assert re.sub("(?P<µ>x)", 
r"\g<µ>", "xx") == "xx" + assert re.sub("(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)", r"\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>", "xx") == "xx" + # Support > 100 groups. + pat = "|".join("x(?P%x)y" % (i, i) for i in range(1, 200 + 1)) + assert re.sub(pat, r"\g<200>", "xc8yzxc8y") == "c8zc8" + + +def test_symbolic_refs_errors(): + check_template_error("(?Px)", r"\gx)", r"\g<", "xx") + check_template_error("(?Px)", r"\g", "xx") + check_template_error("(?Px)", r"\g", "xx") + check_template_error("(?Px)", r"\g<>", "xx") + check_template_error("(?Px)", r"\g<1a1>", "xx") + check_template_error("(?Px)", r"\g<2>", "xx") + check_template_error("(?Px)", r"\2", "xx") + check_template_error("(?Px)", r"\g", "xx") + check_template_error("(?Px)", r"\g<-1>", "xx") + check_template_error("(?Px)", r"\g<+1>", "xx") + check_template_error("()" * 10, r"\g<1_0>", "xx") + check_template_error("(?Px)", r"\g< 1 >", "xx") + check_template_error("(?Px)", r"\g<©>", "xx") + check_template_error(b"(?Px)", b"\\g<\xc2\xb5>", b"xx") + check_template_error("(?Px)", r"\g<㊀>", "xx") + check_template_error("(?Px)", r"\g<¹>", "xx") + check_template_error("(?Px)", r"\g<१>", "xx") + + +def test_re_subn(): + assert re.subn("(?i)b+", "x", "bbbb BBBB") == ("x x", 2) + assert re.subn("b+", "x", "bbbb BBBB") == ("x BBBB", 1) + assert re.subn("b+", "x", "xyz") == ("xyz", 0) + assert re.subn("b*", "x", "xyz") == ("xxxyxzx", 4) + assert re.subn("b*", "x", "xyz", count=2) == ("xxxyz", 2) + + with pytest.raises(TypeError): + re.subn("a", "b", "aaaaa", 1, count=1) + with pytest.raises(TypeError): + re.subn("a", "b", "aaaaa", 1, 0, flags=0) + + +def test_re_split(): + for string in (":a:b::c", S(":a:b::c")): + assert_typed_equal(re.split(":", string), ["", "a", "b", "", "c"]) + assert_typed_equal(re.split(":+", string), ["", "a", "b", "c"]) + assert_typed_equal(re.split("(:+)", string), ["", ":", "a", ":", "b", "::", "c"]) + for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), memoryview(b":a:b::c")): + assert_typed_equal(re.split(b":", string), [b"", b"a", 
b"b", b"", b"c"]) + assert_typed_equal(re.split(b":+", string), [b"", b"a", b"b", b"c"]) + assert_typed_equal(re.split(b"(:+)", string), [b"", b":", b"a", b":", b"b", b"::", b"c"]) + for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", "\U0001d49c\U0001d49e\U0001d4b5"): + string = ":%s:%s::%s" % (a, b, c) + assert re.split(":", string) == ["", a, b, "", c] + assert re.split(":+", string) == ["", a, b, c] + assert re.split("(:+)", string) == ["", ":", a, ":", b, "::", c] + + assert re.split("(?::+)", ":a:b::c") == ["", "a", "b", "c"] + assert re.split("(:)+", ":a:b::c") == ["", ":", "a", ":", "b", ":", "c"] + assert re.split("([b:]+)", ":a:b::c") == ["", ":", "a", ":b::", "c"] + assert re.split("(b)|(:+)", ":a:b::c") == [ + "", + None, + ":", + "a", + None, + ":", + "", + "b", + None, + "", + None, + "::", + "c", + ] + assert re.split("(?:b)|(?::+)", ":a:b::c") == ["", "a", "", "", "c"] + + for sep, expected in [ + (":*", ["", "", "a", "", "b", "", "c", ""]), + ("(?::*)", ["", "", "a", "", "b", "", "c", ""]), + ("(:*)", ["", ":", "", "", "a", ":", "", "", "b", "::", "", "", "c", "", ""]), + ("(:)*", ["", ":", "", None, "a", ":", "", None, "b", ":", "", None, "c", None, ""]), + ]: + assert_typed_equal(re.split(sep, ":a:b::c"), expected) + + for sep, expected in [ + ("", ["", ":", "a", ":", "b", ":", ":", "c", ""]), + (r"\b", [":", "a", ":", "b", "::", "c", ""]), + (r"(?=:)", ["", ":a", ":b", ":", ":c"]), + (r"(?<=:)", [":", "a:", "b:", ":", "c"]), + ]: + assert_typed_equal(re.split(sep, ":a:b::c"), expected) + + +def test_qualified_re_split(): + assert re.split(":", ":a:b::c", maxsplit=2) == ["", "a", "b::c"] + assert re.split(":", "a:b:c:d", maxsplit=2) == ["a", "b", "c:d"] + assert re.split("(:)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"] + assert re.split("(:+)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"] + assert re.split("(:*)", ":a:b::c", maxsplit=2) == ["", ":", "", "", "a:b::c"] + + with pytest.raises(TypeError): + re.split(":", 
":a:b::c", 2, maxsplit=2) + with pytest.raises(TypeError): + re.split(":", ":a:b::c", 2, 0, flags=0) + + +def test_re_findall(): + assert re.findall(":+", "abc") == [] + for string in ("a:b::c:::d", S("a:b::c:::d")): + assert_typed_equal(re.findall(":+", string), [":", "::", ":::"]) + assert_typed_equal(re.findall("(:+)", string), [":", "::", ":::"]) + assert_typed_equal(re.findall("(:)(:*)", string), [(":", ""), (":", ":"), (":", "::")]) + for string in ( + b"a:b::c:::d", + B(b"a:b::c:::d"), + bytearray(b"a:b::c:::d"), + memoryview(b"a:b::c:::d"), + ): + assert_typed_equal(re.findall(b":+", string), [b":", b"::", b":::"]) + assert_typed_equal(re.findall(b"(:+)", string), [b":", b"::", b":::"]) + assert_typed_equal( + re.findall(b"(:)(:*)", string), [(b":", b""), (b":", b":"), (b":", b"::")] + ) + for x in ("\xe0", "\u0430", "\U0001d49c"): + xx = x * 2 + xxx = x * 3 + string = "a%sb%sc%sd" % (x, xx, xxx) + assert re.findall("%s+" % x, string) == [x, xx, xxx] + assert re.findall("(%s+)" % x, string) == [x, xx, xxx] + assert re.findall("(%s)(%s*)" % (x, x), string) == [(x, ""), (x, x), (x, xx)] + + +def test_bug_117612(): + assert re.findall(r"(a|(b))", "aba") == [("a", ""), ("b", "b"), ("a", "")] + + +def test_re_match(): + for string in ("a", S("a")): + assert re.match("a", string).groups() == () + assert re.match("(a)", string).groups() == ("a",) + assert re.match("(a)", string).group(0) == "a" + assert re.match("(a)", string).group(1) == "a" + assert re.match("(a)", string).group(1, 1) == ("a", "a") + for string in (b"a", B(b"a"), bytearray(b"a"), memoryview(b"a")): + assert re.match(b"a", string).groups() == () + assert re.match(b"(a)", string).groups() == (b"a",) + assert re.match(b"(a)", string).group(0) == b"a" + assert re.match(b"(a)", string).group(1) == b"a" + assert re.match(b"(a)", string).group(1, 1) == (b"a", b"a") + for a in ("\xe0", "\u0430", "\U0001d49c"): + assert re.match(a, a).groups() == () + assert re.match("(%s)" % a, a).groups() == (a,) + 
assert re.match("(%s)" % a, a).group(0) == a + assert re.match("(%s)" % a, a).group(1) == a + assert re.match("(%s)" % a, a).group(1, 1) == (a, a) + + pat = re.compile("((a)|(b))(c)?") + assert pat.match("a").groups() == ("a", "a", None, None) + assert pat.match("b").groups() == ("b", None, "b", None) + assert pat.match("ac").groups() == ("a", "a", None, "c") + assert pat.match("bc").groups() == ("b", None, "b", "c") + assert pat.match("bc").groups("") == ("b", "", "b", "c") + + pat = re.compile("(?:(?Pa)|(?Pb))(?Pc)?") + assert pat.match("a").group(1, 2, 3) == ("a", None, None) + assert pat.match("b").group("a1", "b2", "c3") == (None, "b", None) + assert pat.match("ac").group(1, "b2", 3) == ("a", None, "c") + + +def test_group(): + class Index: + def __init__(self, value): + self.value = value + + def __index__(self): + return self.value + + # A single group + m = re.match("(a)(b)", "ab") + assert m.group() == "ab" + assert m.group(0) == "ab" + assert m.group(1) == "a" + assert m.group(Index(1)) == "a" + assert_raises(IndexError, m.group, -1) + assert_raises(IndexError, m.group, 3) + assert_raises(IndexError, m.group, 1 << 1000) + + # Unclear why the below fails + # assert_raises(IndexError, m.group, Index(1 << 1000)) + + assert_raises(IndexError, m.group, "x") + # Multiple groups + assert m.group(2, 1) == ("b", "a") + assert m.group(Index(2), Index(1)) == ("b", "a") + + +def test_match_getitem(): + pat = re.compile("(?:(?Pa)|(?Pb))(?Pc)?") + + m = pat.match("a") + assert m["a1"] == "a" + assert m["b2"] == None + assert m["c3"] == None + assert "a1={a1} b2={b2} c3={c3}".format_map(m) == "a1=a b2=None c3=None" + assert m[0] == "a" + assert m[1] == "a" + assert m[2] == None + assert m[3] == None + with pytest.raises(IndexError): + m["X"] + with pytest.raises(IndexError): + m[-1] + with pytest.raises(IndexError): + m[4] + with pytest.raises(IndexError): + m[0, 1] + with pytest.raises(IndexError): + m[(0,)] + with pytest.raises(IndexError): + m[(0, 1)] + with 
pytest.raises(IndexError): + "a1={a2}".format_map(m) + + m = pat.match("ac") + assert m["a1"] == "a" + assert m["b2"] == None + assert m["c3"] == "c" + assert "a1={a1} b2={b2} c3={c3}".format_map(m) == "a1=a b2=None c3=c" + assert m[0] == "ac" + assert m[1] == "a" + assert m[2] == None + assert m[3] == "c" + + # Cannot assign. + with pytest.raises(TypeError): + m[0] = 1 + + # No len(). + assert_raises(TypeError, len, m) + + +def test_re_fullmatch(): + # Issue 16203: Proposal: add re.fullmatch() method. + assert re.fullmatch(r"a", "a").span() == (0, 1) + for string in "ab", S("ab"): + assert re.fullmatch(r"a|ab", string).span() == (0, 2) + for string in (b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab")): + assert re.fullmatch(rb"a|ab", string).span() == (0, 2) + for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e": + r = r"%s|%s" % (a, a + b) + assert re.fullmatch(r, a + b).span() == (0, 2) + assert re.fullmatch(r".*?$", "abc").span() == (0, 3) + assert re.fullmatch(r".*?", "abc").span() == (0, 3) + assert re.fullmatch(r"a.*?b", "ab").span() == (0, 2) + assert re.fullmatch(r"a.*?b", "abb").span() == (0, 3) + assert re.fullmatch(r"a.*?b", "axxb").span() == (0, 4) + assert re.fullmatch(r"a+", "ab") is None + assert re.fullmatch(r"abc$", "abc\n") is None + assert re.fullmatch(r"abc\z", "abc\n") is None + assert re.fullmatch(r"abc\Z", "abc\n") is None + assert re.fullmatch(r"(?m)abc$", "abc\n") is None + assert re.fullmatch(r"ab(?=c)cd", "abcd").span() == (0, 4) + assert re.fullmatch(r"ab(?<=b)cd", "abcd").span() == (0, 4) + assert re.fullmatch(r"(?=a|ab)ab", "ab").span() == (0, 2) + + assert re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3) + assert re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3) + assert re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3) + + +def test_re_groupref_exists(): + assert re.match(r"^(\()?([^()]+)(?(1)\))$", "(a)").groups() == ("(", "a") + assert 
re.match(r"^(\()?([^()]+)(?(1)\))$", "a").groups() == (None, "a") + assert re.match(r"^(\()?([^()]+)(?(1)\))$", "a)") is None + assert re.match(r"^(\()?([^()]+)(?(1)\))$", "(a") is None + assert re.match("^(?:(a)|c)((?(1)b|d))$", "ab").groups() == ("a", "b") + assert re.match(r"^(?:(a)|c)((?(1)b|d))$", "cd").groups() == (None, "d") + assert re.match(r"^(?:(a)|c)((?(1)|d))$", "cd").groups() == (None, "d") + assert re.match(r"^(?:(a)|c)((?(1)|d))$", "a").groups() == ("a", "") + + # Tests for bug #1177831: exercise groups other than the first group + p = re.compile("(?Pa)(?Pb)?((?(g2)c|d))") + assert p.match("abc").groups() == ("a", "b", "c") + assert p.match("ad").groups() == ("a", None, "d") + assert p.match("abd") is None + assert p.match("ac") is None + + # Support > 100 groups. + pat = "|".join("x(?P%x)y" % (i, i) for i in range(1, 200 + 1)) + pat = "(?:%s)(?(200)z)" % pat + assert re.match(pat, "xc8yz").span() == (0, 5) + + +def test_re_groupref_exists_errors(): + check_pattern_error(r"(?P)(?(0)a|b)") + check_pattern_error(r"()(?(+1)a|b)") + check_pattern_error(r"()" * 10 + r"(?(1_0)a|b)") + check_pattern_error(r"()(?( 1 )a|b)") + check_pattern_error(r"()(?(㊀)a|b)") + check_pattern_error(r"()(?(¹)a|b)") + check_pattern_error(r"()(?(१)a|b)") + check_pattern_error(r"()(?(1") + check_pattern_error(r"()(?(1)a") + check_pattern_error(r"()(?(1)a|b") + check_pattern_error(r"()(?(1)a|b|c") + check_pattern_error(r"()(?(1)a|b|c)") + check_pattern_error(r"()(?(2)a)") + + +def test_re_groupref_exists_validation_bug(): + for i in range(256): + re.compile(r"()(?(1)\x%02x?)" % i) + + +def test_re_groupref(): + assert re.match(r"^(\|)?([^()]+)\1$", "|a|").groups() == ("|", "a") + assert re.match(r"^(\|)?([^()]+)\1?$", "a").groups() == (None, "a") + assert re.match(r"^(\|)?([^()]+)\1$", "a|") is None + assert re.match(r"^(\|)?([^()]+)\1$", "|a") is None + assert re.match(r"^(?:(a)|c)(\1)$", "aa").groups() == ("a", "a") + assert re.match(r"^(?:(a)|c)(\1)?$", "c").groups() == 
def test_groupdict():
    """``Match.groupdict`` maps each named group to the text it matched."""
    # The group names must appear in the pattern for the expected keys to exist.
    assert re.match("(?P<first>first) (?P<second>second)", "first second").groupdict() == {
        "first": "first",
        "second": "second",
    }


def test_expand():
    r"""``Match.expand`` substitutes ``\N`` and ``\g<name>`` references."""
    assert (
        re.match("(?P<first>first) (?P<second>second)", "first second").expand(
            r"\2 \1 \g<second> \g<first>"
        )
        == "second first second first"
    )
    # A group that did not participate expands to the empty string, so only
    # the separating space of the template is left.
    assert re.match("(?P<first>first)|(?P<second>second)", "first").expand(r"\2 \g<second>") == " "
re.compile("(?i)(?Pa)(?Pb)").groupindex == {"first": 1, "other": 2} + + assert re.match("(a)", "a").pos == 0 + assert re.match("(a)", "a").endpos == 1 + assert re.match("(a)", "a").string == "a" + assert re.match("(a)", "a").re + + # Issue 14260. groupindex should be non-modifiable mapping. + p = re.compile(r"(?i)(?Pa)(?Pb)") + assert sorted(p.groupindex) == ["first", "other"] + assert p.groupindex["other"] == 2 + + with pytest.raises(TypeError): + p.groupindex["other"] = 0 + + assert p.groupindex["other"] == 2 + + +def test_special_escapes(): + assert re.search(r"\b(b.)\b", "abcd abc bcd bx").group(1) == "bx" + assert re.search(r"\B(b.)\B", "abc bcd bc abxd").group(1) == "bx" + + # TODO: Add ASCII + assert re.search(r"\b(b.)\b", "abcd abc bcd bx", re.ASCII).group(1) == "bx" + assert re.search(r"\B(b.)\B", "abc bcd bc abxd", re.ASCII).group(1) == "bx" + + assert re.search(r"^abc$", "\nabc\n", re.M).group(0) == "abc" + assert re.search(r"^\Aabc\z$", "abc", re.M).group(0) == "abc" + assert re.search(r"^\Aabc\z$", "\nabc\n", re.M) is None + assert re.search(r"^\Aabc\Z$", "abc", re.M).group(0) == "abc" + assert re.search(r"^\Aabc\Z$", "\nabc\n", re.M) is None + assert re.search(rb"\b(b.)\b", b"abcd abc bcd bx").group(1) == b"bx" + assert re.search(rb"\B(b.)\B", b"abc bcd bc abxd").group(1) == b"bx" + assert re.search(rb"^abc$", b"\nabc\n", re.M).group(0) == b"abc" + assert re.search(rb"^\Aabc\z$", b"abc", re.M).group(0) == b"abc" + assert re.search(rb"^\Aabc\z$", b"\nabc\n", re.M) is None + assert re.search(rb"^\Aabc\Z$", b"abc", re.M).group(0) == b"abc" + assert re.search(rb"^\Aabc\Z$", b"\nabc\n", re.M) is None + assert re.search(r"\d\D\w\W\s\S", "1aa! a").group(0) == "1aa! a" + assert re.search(rb"\d\D\w\W\s\S", b"1aa! a").group(0) == b"1aa! a" + assert re.search(r"\d\D\w\W\s\S", "1aa! a", re.ASCII).group(0) == "1aa! 
a" + + +def test_other_escapes(): + check_pattern_error("\\") + + assert re.match(r"\(", "(").group() == "(" + assert re.match(r"\(", ")") is None + assert re.match(r"\\", "\\").group() == "\\" + assert re.match(r"[\]]", "]").group() == "]" + assert re.match(r"[\]]", "[") is None + assert re.match(r"[a\-c]", "-").group() == "-" + assert re.match(r"[a\-c]", "b") is None + assert re.match(r"[\^a]+", "a^").group() == "a^" + assert re.match(r"[\^a]+", "b") is None + + for c in "cijlmopqyCFIJLMOPTY": + check_pattern_error("\\%c" % c) + for c in "cijlmopqyzABCFIJLMOPTYZ": + check_pattern_error("[\\%c]" % c) + + +def test_word_boundaries(): + # See http://bugs.python.org/issue10713 + assert re.search(r"\b(abc)\b", "abc").group(1) == "abc" + assert re.search(r"\b(abc)\b", "abc", re.ASCII).group(1) == "abc" + assert re.search(rb"\b(abc)\b", b"abc").group(1) == b"abc" + assert re.search(r"\b(ьюя)\b", "ьюя").group(1) == "ьюя" + assert re.search(r"\b(ьюя)\b", "ьюя", re.ASCII) is None + # There's a word boundary between a word and a non-word. + assert re.match(r".\b", "a=") + assert re.match(r".\b", "a=", re.ASCII) + assert re.match(rb".\b", b"a=") + assert re.match(r".\b", "я=") + assert re.match(r".\b", "я=", re.ASCII) is None + # There's a word boundary between a non-word and a word. + assert re.match(r".\b", "=a") + assert re.match(r".\b", "=a", re.ASCII) + assert re.match(rb".\b", b"=a") + assert re.match(r".\b", "=я") + assert re.match(r".\b", "=я", re.ASCII) is None + # There is no word boundary inside a word. + assert re.match(r".\b", "ab") is None + assert re.match(r".\b", "ab", re.ASCII) is None + assert re.match(rb".\b", b"ab") is None + assert re.match(r".\b", "юя") is None + assert re.match(r".\b", "юя", re.ASCII) is None + # There is no word boundary between a non-word characters. 
+ assert re.match(r".\b", "=-") is None + assert re.match(r".\b", "=-", re.ASCII) is None + assert re.match(rb".\b", b"=-") is None + # There is no non-boundary match between a word and a non-word. + assert re.match(r".\B", "a=") is None + assert re.match(r".\B", "a=", re.ASCII) is None + assert re.match(rb".\B", b"a=") is None + assert re.match(r".\B", "я=") is None + assert re.match(r".\B", "я=", re.ASCII) + # There is no non-boundary match between a non-word and a word. + assert re.match(r".\B", "=a") is None + assert re.match(r".\B", "=a", re.ASCII) is None + assert re.match(rb".\B", b"=a") is None + assert re.match(r".\B", "=я") is None + assert re.match(r".\B", "=я", re.ASCII) + # There's a non-boundary match inside a word. + assert re.match(r".\B", "ab") + assert re.match(r".\B", "ab", re.ASCII) + assert re.match(rb".\B", b"ab") + assert re.match(r".\B", "юя") + assert re.match(r".\B", "юя", re.ASCII) + # There's a non-boundary match between a non-word characters. + assert re.match(r".\B", "=-") + assert re.match(r".\B", "=-", re.ASCII) + assert re.match(rb".\B", b"=-") + # There's a word boundary at the start of a string. + assert re.match(r"\b", "abc") + assert re.match(r"\b", "abc", re.ASCII) + assert re.match(rb"\b", b"abc") + assert re.match(r"\b", "ьюя") + assert re.match(r"\b", "ьюя", re.ASCII) is None + # There's a word boundary at the end of a string. + assert re.fullmatch(r".+\b", "abc") + assert re.fullmatch(r".+\b", "abc", re.ASCII) + assert re.fullmatch(rb".+\b", b"abc") + assert re.fullmatch(r".+\b", "ьюя") + assert re.search(r"\b", "ьюя", re.ASCII) is None + # A non-empty string includes a non-boundary zero-length match. 
+ assert re.search(r"\B", "abc").span() == (1, 1) + assert re.search(r"\B", "abc", re.ASCII).span() == (1, 1) + assert re.search(rb"\B", b"abc").span() == (1, 1) + assert re.search(r"\B", "ьюя").span() == (1, 1) + assert re.search(r"\B", "ьюя", re.ASCII).span() == (0, 0) + # There is no non-boundary match at the start of a string. + assert re.match(r"\B", "abc") is None + assert re.match(r"\B", "abc", re.ASCII) is None + assert re.match(rb"\B", b"abc") is None + assert re.match(r"\B", "ьюя") is None + assert re.match(r"\B", "ьюя", re.ASCII) + # There is no non-boundary match at the end of a string. + assert re.fullmatch(r".+\B", "abc") is None + assert re.fullmatch(r".+\B", "abc", re.ASCII) is None + assert re.fullmatch(rb".+\B", b"abc") is None + assert re.fullmatch(r".+\B", "ьюя") is None + assert re.fullmatch(r".+\B", "ьюя", re.ASCII) + # However, an empty string contains no word boundaries. + assert re.search(r"\b", "") is None + assert re.search(r"\b", "", re.ASCII) is None + assert re.search(rb"\b", b"") is None + assert re.search(r"\B", "") + assert re.search(r"\B", "", re.ASCII) + assert re.search(rb"\B", b"") + # A single word-character string has two boundaries, but no + # non-boundary gaps. + assert len(re.findall(r"\b", "a")) == 2 + assert len(re.findall(r"\b", "a", re.ASCII)) == 2 + assert len(re.findall(rb"\b", b"a")) == 2 + assert len(re.findall(r"\B", "a")) == 0 + assert len(re.findall(r"\B", "a", re.ASCII)) == 0 + assert len(re.findall(rb"\B", b"a")) == 0 + # If there are no words, there are no boundaries + assert len(re.findall(r"\b", " ")) == 0 + assert len(re.findall(r"\b", " ", re.ASCII)) == 0 + assert len(re.findall(rb"\b", b" ")) == 0 + assert len(re.findall(r"\b", " ")) == 0 + assert len(re.findall(r"\b", " ", re.ASCII)) == 0 + assert len(re.findall(rb"\b", b" ")) == 0 + # Can match around the whitespace. 
+ assert len(re.findall(r"\B", " ")) == 2 + assert len(re.findall(r"\B", " ", re.ASCII)) == 2 + assert len(re.findall(rb"\B", b" ")) == 2 + + +def test_bigcharset(): + assert re.match("([\u2222\u2223])", "\u2222").group(1) == "\u2222" + + +def test_big_codesize(): + # Issue #1160 + r = re.compile("|".join(("%d" % x for x in range(5000)))) + assert r.match("1000") + assert r.match("9999") + + +def test_anyall(): + assert re.match("a.b", "a\nb", re.DOTALL).group(0) == "a\nb" + assert re.match("a.*b", "a\n\nb", re.DOTALL).group(0) == "a\n\nb" + + +def test_lookahead(): + assert re.match(r"(a(?=\s[^a]))", "a b").group(1) == "a" + assert re.match(r"(a(?=\s[^a]*))", "a b").group(1) == "a" + assert re.match(r"(a(?=\s[abc]))", "a b").group(1) == "a" + assert re.match(r"(a(?=\s[abc]*))", "a bc").group(1) == "a" + assert re.match(r"(a)(?=\s\1)", "a a").group(1) == "a" + assert re.match(r"(a)(?=\s\1*)", "a aa").group(1) == "a" + assert re.match(r"(a)(?=\s(abc|a))", "a a").group(1) == "a" + + assert re.match(r"(a(?!\s[^a]))", "a a").group(1) == "a" + assert re.match(r"(a(?!\s[abc]))", "a d").group(1) == "a" + assert re.match(r"(a)(?!\s\1)", "a b").group(1) == "a" + assert re.match(r"(a)(?!\s(abc|a))", "a b").group(1) == "a" + + # Group reference. + assert re.match(r"(a)b(?=\1)a", "aba") + assert re.match(r"(a)b(?=\1)c", "abac") is None + # Conditional group reference. + assert re.match(r"(?:(a)|(x))b(?=(?(2)x|c))c", "abc") + assert re.match(r"(?:(a)|(x))b(?=(?(2)c|x))c", "abc") is None + assert re.match(r"(?:(a)|(x))b(?=(?(2)x|c))c", "abc") + assert re.match(r"(?:(a)|(x))b(?=(?(1)b|x))c", "abc") is None + assert re.match(r"(?:(a)|(x))b(?=(?(1)c|x))c", "abc") + # Group used before defined. 
+ assert re.match(r"(a)b(?=(?(2)x|c))(c)", "abc") + assert re.match(r"(a)b(?=(?(2)b|x))(c)", "abc") is None + assert re.match(r"(a)b(?=(?(1)c|x))(c)", "abc") + + +def test_lookbehind(): + assert re.match(r"ab(?<=b)c", "abc") + assert re.match(r"ab(?<=c)c", "abc") is None + assert re.match(r"ab(?a)(?Pb)?b", "ab").lastgroup == "a" + assert re.match(r"(?Pa(b))", "ab").lastgroup == "a" + assert re.match(r"((a))", "a").lastindex == 1 + + +def test_bug_418626(): + # bugs 418626 at al. -- Testing Greg Chapman's addition of op code + # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of + # pattern '*?' on a long string. + assert re.match(".*?c", 10000 * "ab" + "cd").end(0) == 20001 + assert re.match(".*?cd", 5000 * "ab" + "c" + 5000 * "ab" + "cde").end(0) == 20003 + assert re.match(".*?cd", 20000 * "abc" + "de").end(0) == 60001 + # non-simple '*?' still used to hit the recursion limit, before the + # non-recursive scheme was implemented. + assert re.search("(a|b)*?c", 10000 * "ab" + "cd", jit=False).end(0) == 20001 + + +def test_stack_overflow(): + # nasty cases that used to overflow the straightforward recursive + # implementation of repeated groups. 
+ assert re.match("(x)*", 50000 * "x").group(1) == "x" + assert re.match("(x)*y", 50000 * "x" + "y").group(1) == "x" + assert re.match("(x)*?y", 50000 * "x" + "y").group(1) == "x" + + +def test_nothing_to_repeat(): + for reps in "*", "+", "?", "{1,2}": + for mod in "", "?": + check_pattern_error("%s%s" % (reps, mod)) + check_pattern_error("(?:%s%s)" % (reps, mod)) + + +def test_multiple_repeat(): + for outer_reps in "*", "+", "?", "{1,2}": + for outer_mod in "", "?", "+": + outer_op = outer_reps + outer_mod + for inner_reps in "*", "+", "?", "{1,2}": + for inner_mod in "", "?", "+": + if inner_mod + outer_reps in ("?", "+"): + continue + inner_op = inner_reps + inner_mod + check_pattern_error(r"x%s%s" % (inner_op, outer_op)) + + +def test_unlimited_zero_width_repeat(): + # Issue #9669 + assert re.match(r"(?:a?)*y", "z") is None + assert re.match(r"(?:a?)+y", "z") is None + assert re.match(r"(?:a?){2,}y", "z") is None + assert re.match(r"(?:a?)*?y", "z") is None + assert re.match(r"(?:a?)+?y", "z") is None + assert re.match(r"(?:a?){2,}?y", "z") is None + + +def test_bug_448951(): + # bug 448951 (similar to 429357, but with single char match) + # (Also test greedy matches.) 
+ for op in "", "?", "*": + assert re.match(r"((.%s):)?z" % op, "z").groups() == (None, None) + assert re.match(r"((.%s):)?z" % op, "a:z").groups() == ("a:", "a") + + +def test_bug_725106(): + # capturing groups in alternatives in repeats + assert re.match("^((a)|b)*", "abc").groups() == ("b", "a") + assert re.match("^(([ab])|c)*", "abc").groups() == ("c", "b") + assert re.match("^((d)|[ab])*", "abc").groups() == ("b", None) + assert re.match("^((a)c|[ab])*", "abc").groups() == ("b", None) + assert re.match("^((a)|b)*?c", "abc").groups() == ("b", "a") + assert re.match("^(([ab])|c)*?d", "abcd").groups() == ("c", "b") + assert re.match("^((d)|[ab])*?c", "abc").groups() == ("b", None) + assert re.match("^((a)c|[ab])*?c", "abc").groups() == ("b", None) + + +def test_bug_725149(): + # mark_stack_base restoring before restoring marks + assert re.match("(a)(?:(?=(b)*)c)*", "abb").groups() == ("a", None) + assert re.match("(a)((?!(b)*))*", "abb").groups() == ("a", None, None) + + +def test_bug_764548(): + # bug 764548, re.compile() barfs on str/unicode subclasses + class my_unicode(str): + pass + + pat = re.compile(my_unicode("abc")) + assert pat.match("xyz") is None + + +def test_finditer(): + iter = re.finditer(r":+", "a:b::c:::d") + assert [item.group(0) for item in iter] == [":", "::", ":::"] + + pat = re.compile(r":+") + iter = pat.finditer("a:b::c:::d", 1, 10) + assert [item.group(0) for item in iter] == [":", "::", ":::"] + + pat = re.compile(r":+") + iter = pat.finditer("a:b::c:::d", pos=1, endpos=10) + assert [item.group(0) for item in iter] == [":", "::", ":::"] + + pat = re.compile(r":+") + iter = pat.finditer("a:b::c:::d", endpos=10, pos=1) + assert [item.group(0) for item in iter] == [":", "::", ":::"] + + pat = re.compile(r":+") + iter = pat.finditer("a:b::c:::d", pos=3, endpos=8) + assert [item.group(0) for item in iter] == ["::", "::"] + + +def test_bug_926075(): + assert re.compile("bug_926075") is not re.compile(b"bug_926075") + + +def test_bug_931848(): 
+ pattern = "[\u002e\u3002\uff0e\uff61]" + assert re.compile(pattern).split("a.b.c") == ["a", "b", "c"] + + +def test_bug_581080(): + iter = re.finditer(r"\s", "a b") + assert next(iter).span() == (1, 2) + assert_raises(StopIteration, next, iter) + + +def test_bug_817234(): + iter = re.finditer(r".*", "asdf") + assert next(iter).span() == (0, 4) + assert next(iter).span() == (4, 4) + assert_raises(StopIteration, next, iter) + + +def test_bug_6561(): + # '\d' should match characters in Unicode category 'Nd' + # (Number, Decimal Digit), but not those in 'Nl' (Number, + # Letter) or 'No' (Number, Other). + decimal_digits = [ + "\u0037", # '\N{DIGIT SEVEN}', category 'Nd' + "\u0e58", # '\N{THAI DIGIT SIX}', category 'Nd' + "\uff10", # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' + ] + for x in decimal_digits: + assert re.match(r"^\d$", x).group(0) == x + + not_decimal_digits = [ + "\u2165", # '\N{ROMAN NUMERAL SIX}', category 'Nl' + "\u3039", # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' + "\u2082", # '\N{SUBSCRIPT TWO}', category 'No' + "\u32b4", # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' + ] + for x in not_decimal_digits: + assert re.match(r"^\d$", x) is None + + +def test_inline_flags(): + # Bug #1700 + upper_char = "\u1ea0" # Latin Capital Letter A with Dot Below + lower_char = "\u1ea1" # Latin Small Letter A with Dot Below + + p = re.compile("." + upper_char, re.I | re.S) + q = p.match("\n" + lower_char) + assert q + + p = re.compile("." + lower_char, re.I | re.S) + q = p.match("\n" + upper_char) + assert q + + p = re.compile("(?i)." + upper_char, re.S) + q = p.match("\n" + lower_char) + assert q + + p = re.compile("(?i)." + lower_char, re.S) + q = p.match("\n" + upper_char) + assert q + + p = re.compile("(?is)." + upper_char) + q = p.match("\n" + lower_char) + assert q + + p = re.compile("(?is)." + lower_char) + q = p.match("\n" + upper_char) + assert q + + p = re.compile("(?s)(?i)." 
def test_dollar_matches_twice():
    r"""Check where ``$`` matches relative to a trailing ``\n``.

    Every ``$`` match is replaced with ``#``, first with default flags and
    then with ``re.MULTILINE``.
    """

    def check(compiled, cases):
        # Substitute "#" at every "$" match and compare with the expectation.
        for subject, expected in cases:
            assert compiled.sub("#", subject) == expected

    check(
        re.compile("$"),
        (
            ("a\nb\n", "a\nb#\n#"),
            ("a\nb\nc", "a\nb\nc#"),
            ("\n", "#\n#"),
        ),
    )
    check(
        re.compile("$", re.MULTILINE),
        (
            ("a\nb\n", "a#\nb#\n#"),
            ("a\nb\nc", "a#\nb#\nc#"),
            ("\n", "#\n#"),
        ),
    )
is None + # Incompatibilities + check_pattern_error(rb"(?u)\w") + assert_raises(re.PatternError, re.compile, r"(?u)\w", re.ASCII) + check_pattern_error(r"(?au)\w") + + +def test_scoped_flags(): + assert re.match(r"(?i:a)b", "Ab") + assert re.match(r"(?i:a)b", "aB") is None + assert re.match(r"(?-i:a)b", "Ab", re.IGNORECASE) is None + assert re.match(r"(?-i:a)b", "aB", re.IGNORECASE) + assert re.match(r"(?i:(?-i:a)b)", "Ab") is None + assert re.match(r"(?i:(?-i:a)b)", "aB") + assert re.match(r"\w(?a:\W)\w", "\xe0\xe0\xe0") + + check_pattern_error(rb"(?aL:a)") + check_pattern_error(r"(?-") + check_pattern_error(r"(?-+") + check_pattern_error(r"(?-z") + check_pattern_error(r"(?-i") + check_pattern_error(r"(?-i+") + check_pattern_error(r"(?-iz") + check_pattern_error(r"(?i:") + check_pattern_error(r"(?i") + check_pattern_error(r"(?i+") + check_pattern_error(r"(?iz") + + +def test_ignore_spaces(): + for space in " \t\n\r\v\f": + assert re.fullmatch(space + "a", "a", re.VERBOSE) + for space in b" ", b"\t", b"\n", b"\r", b"\v", b"\f": + assert re.fullmatch(space + b"a", b"a", re.VERBOSE) + assert re.fullmatch("(?x) a", "a") + assert re.fullmatch(" (?x) a", "a", re.VERBOSE) + assert re.fullmatch("(?x) (?x) a", "a") + assert re.fullmatch(" a(?x: b) c", " ab c") + assert re.fullmatch(" a(?-x: b) c", "a bc", re.VERBOSE) + assert re.fullmatch("(?x) a(?-x: b) c", "a bc") + assert re.fullmatch("(?x) a| b", "a") + assert re.fullmatch("(?x) a| b", "b") + + +def test_comments(): + assert re.fullmatch("#x\na", "a", re.VERBOSE) + assert re.fullmatch(b"#x\na", b"a", re.VERBOSE) + assert re.fullmatch("(?x)#x\na", "a") + assert re.fullmatch("#x\n(?x)#y\na", "a", re.VERBOSE) + assert re.fullmatch("(?x)#x\n(?x)#y\na", "a") + assert re.fullmatch("#x\na(?x:#y\nb)#z\nc", "#x\nab#z\nc") + assert re.fullmatch("#x\na(?-x:#y\nb)#z\nc", "a#y\nbc", re.VERBOSE) + assert re.fullmatch("(?x)#x\na(?-x:#y\nb)#z\nc", "a#y\nbc") + assert re.fullmatch("(?x)#x\na|#y\nb", "a") + assert 
re.fullmatch("(?x)#x\na|#y\nb", "b") + + +def test_bug_6509(): + # Replacement strings of both types must parse properly. + # all strings + assert re.sub(r"a(\w)", "b\\1", "ac") == "bc" + assert re.sub("a(.)", "b\\1", "a\u1234") == "b\u1234" + assert re.sub("..", lambda m: "str", "a5") == "str" + + # all bytes + assert re.sub(rb"a(\w)", b"b\\1", b"ac") == b"bc" + assert re.sub(b"a(.)", b"b\\1", b"a\xcd") == b"b\xcd" + assert re.sub(b"..", lambda m: b"bytes", b"a5") == b"bytes" + + +def test_search_dot_unicode(): + assert re.search("123.*-", "123abc-") + assert re.search("123.*-", "123\xe9-") + assert re.search("123.*-", "123\u20ac-") + assert re.search("123.*-", "123\U0010ffff-") + assert re.search("123.*-", "123\xe9\u20ac\U0010ffff-") + + +def test_compile(): + # Test return value when given string and pattern as parameter + pattern = re.compile("random pattern") + assert isinstance(pattern, re.Pattern) + same_pattern = re.compile(pattern) + assert isinstance(same_pattern, re.Pattern) + assert same_pattern is pattern + # Test behaviour when not given a string or pattern as parameter + assert_raises(TypeError, re.compile, 0) + + +def test_large_search(): + # Issue #10182: indices were 32-bit-truncated. + size = 2 # * 1024 ** 2 # TODO: Works but is expensive for iterative tests + s = "a" * size + m = re.search("$", s) + assert m is not None + assert m.start() == size + assert m.end() == size + + +def test_large_subn(): + # Issue #10182: indices were 32-bit-truncated. + size = 2 # * 1024 ** 2 # TODO: Works but is expensive for iterative tests + s = "a" * size + r, n = re.subn("", "", s) + assert r == s + assert n == size + 1 + + +def test_bug_16688(): + # Issue 16688: Backreferences make case-insensitive regex fail on + # non-ASCII strings. 
+ assert re.findall(r"(?i)(a)\1", "aa \u0100") == ["a"] + assert re.match(r"(?s).{1,3}", "\u0100\u0100").span() == (0, 2) + + +def test_repeat_minmax_overflow(): + # Issue #13169 + string = "x" * 100000 + assert re.match(r".{65535}", string).span() == (0, 65535) + assert re.match(r".{,65535}", string).span() == (0, 65535) + assert re.match(r".{65535,}?", string).span() == (0, 65535) + + +def test_look_behind_overflow(): + string = "x" * 2_500_000 + p1 = r"(?<=((.{%d}){%d}){%d})" + p2 = r"(?" % ( + type(m).__module__, + type(m).__qualname__, + ) + assert re.search(pattern, repr(m)) + for string in ( + b"[abracadabra]", + B(b"[abracadabra]"), + bytearray(b"[abracadabra]"), + memoryview(b"[abracadabra]"), + ): + m = re.search(rb"(.+)(.*?)\1", string) + pattern = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % ( + type(m).__module__, + type(m).__qualname__, + ) + assert re.search(pattern, repr(m)) + + first, second = list(re.finditer("(aa)|(bb)", "aa bb")) + pattern = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % ( + type(second).__module__, + type(second).__qualname__, + ) + assert re.search(pattern, repr(first)) + pattern = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % ( + type(second).__module__, + type(second).__qualname__, + ) + assert re.search(pattern, repr(second)) + + +def test_zerowidth(): + # Issues 852532, 1647489, 3262, 25054. 
+ assert re.split(r"\b", "a::bc") == ["", "a", "::", "bc", ""] + assert re.split(r"\b|:+", "a::bc") == ["", "a", "", "", "bc", ""] + assert re.split(r"(?)") + check_pattern_error(r"(?") + + +def test_enum(): + # Issue #28082: Check that str(flag) returns a human readable string + # instead of an integer + # TODO: Change representation of enums + # self.assertIn("IGNORECASE", str(re.I)) + # self.assertIn("DOTALL", str(re.S)) + pass + + +def test_bug_34294(): + # Issue 34294: wrong capturing groups + # exists since Python 2 + s = "a\tx" + p = r"\b(?=(\t)|(x))x" + assert re.search(p, s).groups() == (None, "x") + + # introduced in Python 3.7.0 + s = "ab" + p = r"(?=(.)(.)?)" + assert re.findall(p, s), [("a", "b") == ("b", "")] + assert [m.groups() for m in re.finditer(p, s)], [("a", "b") == ("b", None)] + + # test-cases provided by issue34294, introduced in Python 3.7.0 + p = r"(?=<(?P\w+)/?>(?:(?P.+?))?)" + s = "" + assert re.findall(p, s), [("test", "") == ("foo2", "")] + assert [m.groupdict() for m in re.finditer(p, s)] == [ + {"tag": "test", "text": ""}, + {"tag": "foo2", "text": None}, + ] + s = "Hello" + assert [m.groupdict() for m in re.finditer(p, s)] == [ + {"tag": "test", "text": "Hello"}, + {"tag": "foo", "text": None}, + ] + s = "Hello" + assert [m.groupdict() for m in re.finditer(p, s)] == [ + {"tag": "test", "text": "Hello"}, + {"tag": "foo", "text": None}, + {"tag": "foo", "text": None}, + ] + + +def test_MARK_PUSH_macro_bug(): + # issue35859, MARK_PUSH() macro didn't protect MARK-0 if it + # was the only available mark. + assert re.match(r"(ab|a)*?b", "ab").groups() == ("a",) + assert re.match(r"(ab|a)+?b", "ab").groups() == ("a",) + assert re.match(r"(ab|a){0,2}?b", "ab").groups() == ("a",) + assert re.match(r"(.b|a)*?b", "ab").groups() == ("a",) + + +def test_MIN_UNTIL_mark_bug(): + # Fixed in issue35859, reported in issue9134. 
+ # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat + s = "axxzbcz" + p = r"(?:(?:a|bc)*?(xx)??z)*" + assert re.match(p, s).groups() == ("xx",) + + # test-case provided by issue9134 + s = "xtcxyzxc" + p = r"((x|yz)+?(t)??c)*" + m = re.match(p, s) + assert m.span() == (0, 8) + assert m.span(2) == (6, 7) + assert m.groups() == ("xyzxc", "x", "t") + + +def test_REPEAT_ONE_mark_bug(): + # issue35859 + # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat + s = "aabaab" + p = r"(?:[^b]*a(?=(b)|(a))ab)*" + m = re.match(p, s) + assert m.span() == (0, 6) + assert m.span(2) == (4, 5) + assert m.groups() == (None, "a") + + # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat + s = "abab" + p = r"(?:[^b]*(?=(b)|(a))ab)*" + m = re.match(p, s) + assert m.span() == (0, 4) + assert m.span(2) == (2, 3) + assert m.groups() == (None, "a") + + assert re.match(r"(ab?)*?b", "ab").groups() == ("a",) + + +def test_MIN_REPEAT_ONE_mark_bug(): + # issue35859 + # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat + s = "abab" + p = r"(?:.*?(?=(a)|(b))b)*" + m = re.match(p, s) + assert m.span() == (0, 4) + assert m.span(2) == (3, 4) + assert m.groups() == (None, "b") + + s = "axxzaz" + p = r"(?:a*?(xx)??z)*" + assert re.match(p, s).groups() == ("xx",) + + +def test_ASSERT_NOT_mark_bug(): + # Fixed in issue35859, reported in issue725149. 
def test_possessive_quantifiers():
    """Test possessive quantifiers.

    Test quantifiers of the form @+ for some repetition operator @,
    e.g. x{3,5}+ meaning match from 3 to 5 greedily and proceed
    without creating a stack frame for rolling the stack back and
    trying 1 or more fewer matches.
    """
    # Once a possessive repeat has consumed its text it never gives any back,
    # so a following atom that needs one of those characters cannot match.
    assert re.match("e*+e", "eeee") is None
    assert re.match("e++a", "eeea").group(0) == "eeea"
    assert re.match("e?+a", "ea").group(0) == "ea"
    assert re.match("e{2,4}+a", "eeea").group(0) == "eeea"
    assert re.match("(.)++.", "ee") is None
    assert re.match("(ae)*+a", "aea").groups() == ("ae",)
    assert re.match("([ae][ae])?+a", "aea").groups() == ("ae",)
    assert re.match("(e?){2,4}+a", "eeea").groups() == ("",)
    assert re.match("()*+a", "a").groups() == ("",)
    assert re.search("x*+", "axx").span() == (0, 0)
    assert re.search("x++", "axx").span() == (1, 3)
    assert re.match("a*+", "xxx").span() == (0, 0)
    assert re.match("x*+", "xxxa").span() == (0, 3)
    assert re.match("a++", "xxx") is None
    assert re.match(r"^(\w){1}+$", "abc") is None
    assert re.match(r"^(\w){1,2}+$", "abc") is None

    # A repeated capture group keeps the text of its last iteration.
    assert re.match(r"^(\w){3}+$", "abc").group(1) == "c"
    assert re.match(r"^(\w){1,3}+$", "abc").group(1) == "c"
    assert re.match(r"^(\w){1,4}+$", "abc").group(1) == "c"

    assert re.match("^x{1}+$", "xxx") is None
    assert re.match("^x{1,2}+$", "xxx") is None

    assert re.match("^x{3}+$", "xxx")
    assert re.match("^x{1,3}+$", "xxx")
    assert re.match("^x{1,4}+$", "xxx")

    # "{}" is not a quantifier, so "x{}+" matches the literal text "x{}".
    assert re.match("^x{}+$", "xxx") is None
    assert re.match("^x{}+$", "x{}")
stack point created within the group once the + group is finished being evaluated.""" + pattern1 = re.compile(r"a(?>bc|b)c") + assert pattern1.match("abc") is None + assert pattern1.match("abcc") + assert re.match(r"(?>.*).", "abc") is None + assert re.match(r"(?>x)++", "xxx") + assert re.match(r"(?>x++)", "xxx") + assert re.match(r"(?>x)++x", "xxx") is None + assert re.match(r"(?>x++)x", "xxx") is None + + +def test_fullmatch_atomic_grouping(): + assert re.fullmatch(r"(?>a+)", "a") + assert re.fullmatch(r"(?>a*)", "a") + assert re.fullmatch(r"(?>a?)", "a") + assert re.fullmatch(r"(?>a{1,3})", "a") + assert re.fullmatch(r"(?>a+)", "ab") is None + assert re.fullmatch(r"(?>a*)", "ab") is None + assert re.fullmatch(r"(?>a?)", "ab") is None + assert re.fullmatch(r"(?>a{1,3})", "ab") is None + assert re.fullmatch(r"(?>a+)b", "ab") + assert re.fullmatch(r"(?>a*)b", "ab") + assert re.fullmatch(r"(?>a?)b", "ab") + assert re.fullmatch(r"(?>a{1,3})b", "ab") + + assert re.fullmatch(r"(?>(?:ab)+)", "ab") + assert re.fullmatch(r"(?>(?:ab)*)", "ab") + assert re.fullmatch(r"(?>(?:ab)?)", "ab") + assert re.fullmatch(r"(?>(?:ab){1,3})", "ab") + assert re.fullmatch(r"(?>(?:ab)+)", "abc") is None + assert re.fullmatch(r"(?>(?:ab)*)", "abc") is None + assert re.fullmatch(r"(?>(?:ab)?)", "abc") is None + assert re.fullmatch(r"(?>(?:ab){1,3})", "abc") is None + assert re.fullmatch(r"(?>(?:ab)+)c", "abc") + assert re.fullmatch(r"(?>(?:ab)*)c", "abc") + assert re.fullmatch(r"(?>(?:ab)?)c", "abc") + assert re.fullmatch(r"(?>(?:ab){1,3})c", "abc") + + +def test_findall_atomic_grouping(): + assert re.findall(r"(?>a+)", "aab") == ["aa"] + assert re.findall(r"(?>a*)", "aab") == ["aa", "", ""] + assert re.findall(r"(?>a?)", "aab") == ["a", "a", "", ""] + assert re.findall(r"(?>a{1,3})", "aab") == ["aa"] + + assert re.findall(r"(?>(?:ab)+)", "ababc") == ["abab"] + assert re.findall(r"(?>(?:ab)*)", "ababc") == ["abab", "", ""] + assert re.findall(r"(?>(?:ab)?)", "ababc") == ["ab", "ab", "", ""] + 
assert re.findall(r"(?>(?:ab){1,3})", "ababc") == ["abab"] + + +def test_bug_gh91616(): + assert re.fullmatch(r"(?s:(?>.*?\.).*)\z", "a.txt") # reproducer + assert re.fullmatch(r"(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\z", "a.txt") + + +def test_bug_gh100061(): + # gh-100061 + assert re.match("(?>(?:.(?!D))+)", "ABCDE").span() == (0, 2) + assert re.match("(?:.(?!D))++", "ABCDE").span() == (0, 2) + assert re.match("(?>(?:.(?!D))*)", "ABCDE").span() == (0, 2) + assert re.match("(?:.(?!D))*+", "ABCDE").span() == (0, 2) + assert re.match("(?>(?:.(?!D))?)", "CDE").span() == (0, 0) + assert re.match("(?:.(?!D))?+", "CDE").span() == (0, 0) + assert re.match("(?>(?:.(?!D)){1,3})", "ABCDE").span() == (0, 2) + assert re.match("(?:.(?!D)){1,3}+", "ABCDE").span() == (0, 2) + # gh-106052 + assert re.match("(?>(?:ab?c)+)", "aca").span() == (0, 2) + assert re.match("(?:ab?c)++", "aca").span() == (0, 2) + assert re.match("(?>(?:ab?c)*)", "aca").span() == (0, 2) + assert re.match("(?:ab?c)*+", "aca").span() == (0, 2) + assert re.match("(?>(?:ab?c)?)", "a").span() == (0, 0) + assert re.match("(?:ab?c)?+", "a").span() == (0, 0) + assert re.match("(?>(?:ab?c){1,3})", "aca").span() == (0, 2) + assert re.match("(?:ab?c){1,3}+", "aca").span() == (0, 2) + + +def test_bug_gh101955(): + # Possessive quantifier with nested alternative with capture groups + assert re.match("((x)|y|z)*+", "xyz").groups() == ("z", "x") + assert re.match("((x)|y|z){3}+", "xyz").groups() == ("z", "x") + assert re.match("((x)|y|z){3,}+", "xyz").groups() == ("z", "x") + + +def test_regression_gh94675(): + # TODO: Multiprocessing requires pickling + pattern = re.compile( + r"(?<=[({}])(((//[^\n]*)?[\n])([\000-\040])*)*" + r"((/[^/\[\n]*(([^\n]|(\[\n]*(]*)*\]))" + r"[^/\[]*)*/))((((//[^\n]*)?[\n])" + r"([\000-\040]|(/\*[^*]*\*+" + r"([^/*]\*+)*/))*)+(?=[^\000-\040);\]}]))" + ) + input_js = """a(function() { + /////////////////////////////////////////////////////////////////// + });""" + p = 
multiprocessing.Process(target=pattern.sub, args=("", input_js)) + p.start() + p.join(30.0) + try: + assert not p.is_alive(), "pattern.sub() timed out" + finally: + if p.is_alive(): + p.terminate() + p.join() + + +def test_fail(): + assert re.search(r"12(?!)|3", "123")[0] == "3" + + +def test_character_set_any(): + # The union of complementary character sets matches any character + # and is equivalent to "(?s:.)". + s = "1x\n" + for p in r"[\s\S]", r"[\d\D]", r"[\w\W]", r"[\S\s]", r"\s|\S": + assert re.findall(p, s) == list(s) + assert re.fullmatch("(?:" + p + ")+", s).group() == s + + +def test_character_set_none(): + # Negation of the union of complementary character sets does not match + # any character. + s = "1x\n" + for p in r"[^\s\S]", r"[^\d\D]", r"[^\w\W]", r"[^\S\s]": + assert re.search(p, s) is None + assert re.search("(?s:.)" + p, s) is None -- 2.30.2